#Luis Enrique García
#Cod. 200111907
#Login Uniandes: Luis.garcial@uniandes.edu.co
# “Al entregar la solución de este parcial, yo, LUIS ENRIQUE GARCIA LEYVA con código 200111907 me comprometo a
# no conversar durante el desarrollo de este examen con ninguna persona que no sea el profesor del curso,
# sobre aspectos relacionados con el parcial; tampoco utilizaré algún medio de comunicación por voz, texto o
# intercambio de archivos, para consultar o compartir con otros, información sobre el tema del parcial. Soy
# consciente y acepto las consecuencias que acarreará para mi desempeño académico cometer fraude en este
# parcial”.
import pandas as pd, sklearn as sl, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import pylev
import re
import warnings
warnings.filterwarnings("ignore")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap.umap_ import UMAP
from random import randint
from datetime import datetime
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas_profiling import ProfileReport
%matplotlib inline
# Se carga el dataFrame
LosAlpesHistorydf = pd.read_csv(r"C:\Users\LGARCIA\OneDrive - Valorem\LGARCIA\Documents\04. Estudio\05. Maestria\Ciencia Aplicada de Datos\CAD_Parcial1\Data\losalpes_history.csv")
#PUNTO 1: Un análisis exploratorio y de calidad de datos a partir de los datos históricos suministrados. En la medida de lo posible, incluya
# técnicas de análisis univariado,bivariado, gráficas y no gráficas. Realice la limpieza y preparación correspondiente para que puedan ser consumidos
# por los algoritmos de machine learning que va a utilizar. Adjunte la evidencia de como quedan los datos preparados, explique y justifique el proceso implementado
# Antes de empezar a entrenar el modelo vamos a revisar una limpieza de datos variable a variable empezando por las variables tipo object: Estandarizando
# los valores y quitando valores NaN. Posteriormente a esto, revisaremos los valores númericos para quitar valores incoherentes y valores Nan. Por último
# ELiminaremos registros duplicados.
LosAlpesHistorydf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 102083 entries, 0 to 102082 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 102083 non-null int64 1 neighbourhood group 101463 non-null object 2 neighbourhood 101476 non-null object 3 lat 101484 non-null float64 4 long 101484 non-null float64 5 country 100967 non-null object 6 instant_bookable 101387 non-null object 7 cancellation_policy 101416 non-null object 8 room type 101492 non-null object 9 construction year 101279 non-null float64 10 price 101245 non-null object 11 service fee 101220 non-null object 12 minimum nights 101089 non-null float64 13 availability 365 101049 non-null float64 14 number of reviews 101313 non-null float64 15 review rate number 101174 non-null float64 dtypes: float64(7), int64(1), object(8) memory usage: 12.5+ MB
# Limpieza #1: Variable Neighbourhood group. Empezaremos por quitar los valores NaN. Posteriormente a esto revisaremos los posibles valores y
# definiremos el conjunto de valores posibles. Posteriormente a esto mediante la función levenshtein llevaremos los valores encontrados hacia
# el conjunto definido.
LosAlpesHistorydf['neighbourhood group'].unique()
array(['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island', nan,
'Quens', 'Manattan', 'Broolkyn', 'Manhatan', 'brookln', 'manhatan'],
dtype=object)
LosAlpesHistorydf['neighbourhood group']=LosAlpesHistorydf['neighbourhood group'].replace(np.nan, 'No Information')
neighbourhood_group_list = ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'State Island','No Information']
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
array(['Staten Island', 'Quens', 'Manattan', 'Broolkyn', 'Manhatan',
'brookln', 'manhatan'], dtype=object)
def corregirneighbourhoodGroup1(x):
distancia = 1
if pylev.levenshtein('Bronx', x) <= distancia:
return 'Bronx'
elif pylev.levenshtein('Brooklyn', x) <= distancia:
return 'Brooklyn'
elif pylev.levenshtein('Manhattan', x) <= distancia:
return 'Manhattan'
elif pylev.levenshtein('Queens', x) <= distancia:
return 'Queens'
elif pylev.levenshtein('Staten Island', x) <= distancia:
return 'State Island'
elif pylev.levenshtein('No Information', x) <= distancia:
return 'No Information'
else:
return x
def corregirneighbourhoodGroup2(x):
distancia = 2
if pylev.levenshtein('Bronx', x) <= distancia:
return 'Bronx'
elif pylev.levenshtein('Brooklyn', x) <= distancia:
return 'Brooklyn'
elif pylev.levenshtein('Manhattan', x) <= distancia:
return 'Manhattan'
elif pylev.levenshtein('Queens', x) <= distancia:
return 'Queens'
elif pylev.levenshtein('Staten Island', x) <= distancia:
return 'State Island'
elif pylev.levenshtein('No Information', x) <= distancia:
return 'No Information'
else:
return x
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
array(['Staten Island', 'Quens', 'Manattan', 'Broolkyn', 'Manhatan',
'brookln', 'manhatan'], dtype=object)
LosAlpesHistorydf['neighbourhood group']=LosAlpesHistorydf['neighbourhood group'].apply(corregirneighbourhoodGroup1)
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
array(['Broolkyn', 'brookln', 'manhatan'], dtype=object)
LosAlpesHistorydf['neighbourhood group']=LosAlpesHistorydf['neighbourhood group'].apply(corregirneighbourhoodGroup2)
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
array([], dtype=object)
LosAlpesHistorydf['neighbourhood group'].unique()
array(['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'State Island',
'No Information'], dtype=object)
# Limpieza #2: Vamos a verificar la Variable neighbourhood que esperamos tenga un volumen alto de posibles valores.
LosAlpesHistorydf['neighbourhood'].unique()
array(["Hell's Kitchen", 'Midtown', 'Upper West Side',
'Financial District', 'Lower East Side', 'Greenwich Village',
'Morningside Heights', 'Crown Heights', 'Astoria', 'Clinton Hill',
'Harlem', 'Bushwick', 'Washington Heights', 'Woodside',
'Prospect-Lefferts Gardens', 'Prospect Heights', 'East Village',
'Bath Beach', 'Brooklyn Heights', 'Borough Park',
'Bedford-Stuyvesant', 'Upper East Side', 'Wakefield',
'Windsor Terrace', 'Williamsburg', 'Flatbush', 'Kew Gardens',
'West Village', 'Flatlands', 'Bay Ridge', 'East New York',
'Kips Bay', 'St. Albans', 'St. George', 'Forest Hills', 'Gramercy',
'Glendale', 'Dongan Hills', 'Far Rockaway', 'Park Slope',
'Greenpoint', 'Concourse', 'Little Italy', nan, 'Elmhurst',
'Chelsea', 'East Harlem', 'Kingsbridge', 'Arverne', 'Sunset Park',
'South Slope', 'Nolita', 'East Flatbush', 'Murray Hill',
'Boerum Hill', 'Ridgewood', 'Coney Island', 'Gowanus',
'Bensonhurst', 'Cambria Heights', 'Cobble Hill',
'Long Island City', 'Flushing', 'Tribeca', 'Inwood', 'Briarwood',
'Spuyten Duyvil', 'Concord', 'NoHo', 'Randall Manor', 'Fordham',
'Springfield Gardens', 'Fort Greene', 'Corona', 'Ditmars Steinway',
'Cypress Hills', 'Chinatown', 'East Elmhurst', 'Flatiron District',
'Jamaica', 'Sheepshead Bay', 'Downtown Brooklyn', 'Edenwald',
'Kensington', 'Oakwood', 'Eltingville', 'Theater District',
'Gravesend', 'Schuylerville', 'SoHo', 'Middle Village', 'Canarsie',
'Richmond Hill', 'Sunnyside', 'Jackson Heights', 'Parkchester',
'Mott Haven', 'Maspeth', 'Stapleton', 'Carroll Gardens',
'Midland Beach', 'Midwood', 'Van Nest', 'Rego Park',
'Fort Hamilton', 'University Heights', 'Woodhaven',
'Brighton Beach', 'Rosebank', 'Williamsbridge', 'Port Morris',
'Vinegar Hill', 'Whitestone', 'Bayside', 'Longwood',
'Port Richmond', 'Laurelton', 'Roosevelt Island', 'Tremont',
'Throgs Neck', 'Unionport', 'Olinville', 'Civic Center',
'City Island', 'Red Hook', 'Clason Point', 'Battery Park City',
'Highbridge', 'Queens Village', 'Todt Hill', 'Rockaway Beach',
'Pelham Gardens', 'Rossville', 'Concourse Village', 'New Brighton',
'Brownsville', 'Baychester', 'South Ozone Park', 'Bronxdale',
'Hunts Point', 'Tompkinsville', 'Ozone Park', 'College Point',
'Mount Eden', 'DUMBO', 'Morris Park', 'Gerritsen Beach',
'Fresh Meadows', 'Westchester Square', 'Marble Hill',
'Shore Acres', 'Jamaica Hills', 'Kew Gardens Hills', 'Two Bridges',
'Claremont Village', 'Bayswater', 'Columbia St', 'Morrisania',
'Jamaica Estates', 'Belle Harbor', 'Howard Beach',
'Lighthouse Hill', 'Norwood', 'Mariners Harbor', 'Rosedale',
'Navy Yard', "Bull's Head", 'Fieldston', 'North Riverdale',
'Bergen Beach', 'Manhattan Beach', 'Grant City', 'Douglaston',
'Emerson Hill', 'Bay Terrace, Staten Island', 'Eastchester',
'Woodlawn', "Prince's Bay", 'Allerton', 'Stuyvesant Town',
'Belmont', 'Castle Hill', 'Arrochar', 'Edgemere',
'Castleton Corners', 'Grymes Hill', 'Pelham Bay',
'East Morrisania', 'Riverdale', 'Bellerose', 'South Beach',
'Hollis', 'Arden Heights', 'Clifton', 'West Brighton',
'Mount Hope', 'Howland Hook', 'Little Neck', 'Dyker Heights',
'Soundview', 'Sea Gate', 'Morris Heights', 'Breezy Point',
'Neponsit', 'Melrose', 'New Dorp Beach', 'Co-op City', 'Huguenot',
'Great Kills', 'New Springville', 'Mill Basin', 'West Farms',
'Chelsea, Staten Island', 'Bay Terrace', 'Tottenville',
'Holliswood', 'Glen Oaks', 'Richmondtown', 'Silver Lake',
'Graniteville', 'Willowbrook', 'Westerleigh', 'Woodrow',
'New Dorp', 'Fort Wadsworth'], dtype=object)
# Al ver la gran variedad de valores diferentes, vamos a hacer una remediación sencilla. Vamos a quitar
# espacios.
LosAlpesHistorydf['neighbourhood'].str.strip()
0 Hell's Kitchen
1 Midtown
2 Upper West Side
3 Financial District
4 Lower East Side
...
102078 Prospect Heights
102079 Williamsburg
102080 Rosedale
102081 Upper West Side
102082 Long Island City
Name: neighbourhood, Length: 102083, dtype: object
# Limpieza #3: Pasamos a la siguiente variable. El pais.
LosAlpesHistorydf['country'].unique()
array(['United States', nan, 'United States of America'], dtype=object)
# Vemos que solo hay una posibilidad "Estados Unidos", así que la vamos asignar a la fuerza. Aunque de inicio vemos que no va a ser una variable
# importante para el modelo de predicción.
# Hacemos un Replace Directo sobre el campo para la limpieza de datos univariable.
LosAlpesHistorydf['country'].replace('United States of America','United States', inplace=True)
LosAlpesHistorydf['country']="United States"
LosAlpesHistorydf['country'].unique()
array(['United States'], dtype=object)
# Limpieza #4: Verificamos la variable Reserva Instantanea. En esta se esperan dos posibles valores True or False. En caso de valores NaN, estos serán
# reemplazados por False
LosAlpesHistorydf['instant_bookable'].unique()
array([True, False, nan], dtype=object)
LosAlpesHistorydf['instant_bookable']=LosAlpesHistorydf['instant_bookable'].replace(np.nan, False)
LosAlpesHistorydf['instant_bookable'].unique()
array([ True, False])
# Limpieza #5: Verificamos la variable Politicas de Cancelación. Esperamos un conjunto de valores reducido. En caso de tener valores Nan
# se reemplazarán por "Sin información".
LosAlpesHistorydf['cancellation_policy'].unique()
array(['strict', 'flexible', 'moderate', nan], dtype=object)
LosAlpesHistorydf['cancellation_policy']=LosAlpesHistorydf['cancellation_policy'].replace(np.nan, "No Information")
LosAlpesHistorydf['cancellation_policy'].unique()
array(['strict', 'flexible', 'moderate', 'No Information'], dtype=object)
# Limpieza #6: Verificamos la variable Tipo de Habitación. Esperamos un conjunto de valores reducido. En caso de tener valores Nan
# se reemplazarán por "Sin información".
LosAlpesHistorydf['room type'].unique()
array(['Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
nan], dtype=object)
LosAlpesHistorydf['room type']=LosAlpesHistorydf['room type'].replace(np.nan, "No Information")
LosAlpesHistorydf['room type'].unique()
array(['Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
'No Information'], dtype=object)
# Limpieza #7: Nos causa curiosidad que los campos precio y Service Fee sean strings y no campos numericos, así que solo
# para verificar voy a sacar un unique y ver todos los valores
LosAlpesHistorydf['price'].unique()
array(['$ 851', '$ 466', '$ 874', '$ 813', '$ 326', '$ 786', '$ 863',
'$ 749', '$ 397', '$ 462', '$ 121', '$ 1,08', '$ 443', '$ 707',
'$ 925', '$ 997', '$ 887', '$ 760', '$ 671', '$ 1,15', '$ 277',
'$ 1,01', '$ 389', '$ 799', '$ 1,06', '$ 828', '$ 833', '$ 306',
'$ 115', '$ 836', '$ 80', '$ 198', '$ 866', '$ 555', '$ 1,04',
'$ 1,14', '$ 279', '$ 685', '$ 381', '$ 784', '$ 723', '$ 103',
'$ 215', '$ 992', '$ 592', '$ 164', '$ 90', '$ 781', '$ 404',
'$ 72', '$ 175', '$ 494', '$ 960', '$ 272', '$ 232', '$ 1,16',
'$ 428', '$ 780', '$ 223', '$ 112', '$ 155', '$ 104', '$ 902',
'$ 865', '$ 119', '$ 501', '$ 607', '$ 1,02', '$ 283', '$ 271',
'$ 276', '$ 541', nan, '$ 593', '$ 212', '$ 1,19', '$ 838',
'$ 553', '$ 172', '$ 1,03', '$ 482', '$ 522', '$ 168', '$ 899',
'$ 1,12', '$ 475', '$ 186', '$ 585', '$ 508', '$ 1,13', '$ 1,11',
'$ 303', '$ 1,10', '$ 611', '$ 251', '$ 896', '$ 296', '$ 583',
'$ 1,00', '$ 106', '$ 814', '$ 544', '$ 847', '$ 577', '$ 615',
'$ 598', '$ 299', '$ 806', '$ 60', '$ 986', '$ 463', '$ 339',
'$ 1,07', '$ 926', '$ 480', '$ 490', '$ 563', '$ 153', '$ 145',
'$ 350', '$ 728', '$ 536', '$ 969', '$ 948', '$ 448', '$ 609',
'$ 367', '$ 379', '$ 618', '$ 697', '$ 291', '$ 107', '$ 777',
'$ 377', '$ 628', '$ 968', '$ 344', '$ 1,05', '$ 617', '$ 1,18',
'$ 658', '$ 185', '$ 376', '$ 190', '$ 683', '$ 455', '$ 521',
'$ 238', '$ 827', '$ 129', '$ 237', '$ 975', '$ 327', '$ 437',
'$ 317', '$ 681', '$ 353', '$ 205', '$ 558', '$ 978', '$ 808',
'$ 823', '$ 695', '$ 295', '$ 1,09', '$ 916', '$ 793', '$ 837',
'$ 216', '$ 288', '$ 402', '$ 909', '$ 136', '$ 955', '$ 354',
'$ 464', '$ 368', '$ 627', '$ 52', '$ 956', '$ 471', '$ 110',
'$ 209', '$ 388', '$ 744', '$ 182', '$ 980', '$ 746', '$ 807',
'$ 659', '$ 571', '$ 517', '$ 535', '$ 323', '$ 735', '$ 967',
'$ 311', '$ 566', '$ 62', '$ 708', '$ 50', '$ 96', '$ 599',
'$ 614', '$ 499', '$ 734', '$ 54', '$ 588', '$ 792', '$ 698',
'$ 549', '$ 419', '$ 788', '$ 890', '$ 369', '$ 79', '$ 416',
'$ 991', '$ 759', '$ 753', '$ 709', '$ 979', '$ 414', '$ 973',
'$ 557', '$ 1,17', '$ 370', '$ 947', '$ 415', '$ 399', '$ 989',
'$ 450', '$ 882', '$ 398', '$ 800', '$ 766', '$ 886', '$ 334',
'$ 885', '$ 931', '$ 76', '$ 211', '$ 481', '$ 297', '$ 656',
'$ 717', '$ 946', '$ 258', '$ 412', '$ 400', '$ 266', '$ 171',
'$ 910', '$ 672', '$ 495', '$ 959', '$ 595', '$ 460', '$ 51',
'$ 278', '$ 420', '$ 81', '$ 912', '$ 927', '$ 214', '$ 257',
'$ 140', '$ 253', '$ 538', '$ 952', '$ 151', '$ 755', '$ 733',
'$ 473', '$ 873', '$ 998', '$ 78', '$ 572', '$ 383', '$ 846',
'$ 655', '$ 231', '$ 293', '$ 883', '$ 362', '$ 418', '$ 504',
'$ 653', '$ 895', '$ 879', '$ 228', '$ 976', '$ 265', '$ 310',
'$ 61', '$ 150', '$ 639', '$ 616', '$ 177', '$ 782', '$ 479',
'$ 325', '$ 130', '$ 425', '$ 502', '$ 645', '$ 856', '$ 165',
'$ 196', '$ 394', '$ 661', '$ 634', '$ 358', '$ 562', '$ 580',
'$ 305', '$ 783', '$ 631', '$ 731', '$ 722', '$ 207', '$ 467',
'$ 203', '$ 921', '$ 488', '$ 309', '$ 128', '$ 233', '$ 221',
'$ 514', '$ 605', '$ 849', '$ 762', '$ 942', '$ 691', '$ 355',
'$ 635', '$ 820', '$ 610', '$ 469', '$ 805', '$ 234', '$ 752',
'$ 511', '$ 917', '$ 246', '$ 529', '$ 75', '$ 676', '$ 93',
'$ 982', '$ 92', '$ 996', '$ 977', '$ 141', '$ 565', '$ 456',
'$ 888', '$ 561', '$ 364', '$ 650', '$ 184', '$ 270', '$ 804',
'$ 222', '$ 604', '$ 688', '$ 858', '$ 71', '$ 950', '$ 220',
'$ 798', '$ 747', '$ 934', '$ 626', '$ 133', '$ 539', '$ 91',
'$ 817', '$ 603', '$ 601', '$ 714', '$ 531', '$ 855', '$ 523',
'$ 392', '$ 189', '$ 174', '$ 342', '$ 657', '$ 810', '$ 204',
'$ 314', '$ 142', '$ 158', '$ 684', '$ 386', '$ 920', '$ 526',
'$ 519', '$ 131', '$ 848', '$ 452', '$ 864', '$ 719', '$ 533',
'$ 622', '$ 169', '$ 318', '$ 796', '$ 625', '$ 984', '$ 256',
'$ 772', '$ 1,20', '$ 87', '$ 638', '$ 407', '$ 758', '$ 892',
'$ 144', '$ 941', '$ 426', '$ 191', '$ 686', '$ 468', '$ 701',
'$ 951', '$ 505', '$ 797', '$ 496', '$ 943', '$ 834', '$ 700',
'$ 623', '$ 629', '$ 346', '$ 286', '$ 53', '$ 802', '$ 285',
'$ 192', '$ 356', '$ 385', '$ 543', '$ 953', '$ 181', '$ 590',
'$ 59', '$ 646', '$ 918', '$ 195', '$ 880', '$ 905', '$ 867',
'$ 282', '$ 763', '$ 640', '$ 429', '$ 461', '$ 328', '$ 567',
'$ 729', '$ 660', '$ 236', '$ 809', '$ 569', '$ 527', '$ 244',
'$ 861', '$ 243', '$ 933', '$ 673', '$ 872', '$ 713', '$ 881',
'$ 330', '$ 453', '$ 936', '$ 290', '$ 667', '$ 275', '$ 302',
'$ 606', '$ 741', '$ 108', '$ 581', '$ 971', '$ 408', '$ 170',
'$ 520', '$ 652', '$ 132', '$ 636', '$ 67', '$ 449', '$ 219',
'$ 756', '$ 540', '$ 778', '$ 101', '$ 438', '$ 732', '$ 785',
'$ 771', '$ 654', '$ 55', '$ 647', '$ 289', '$ 336', '$ 575',
'$ 69', '$ 210', '$ 341', '$ 199', '$ 126', '$ 662', '$ 351',
'$ 432', '$ 329', '$ 712', '$ 178', '$ 160', '$ 751', '$ 82',
'$ 380', '$ 57', '$ 149', '$ 637', '$ 97', '$ 423', '$ 924',
'$ 619', '$ 248', '$ 994', '$ 436', '$ 987', '$ 937', '$ 179',
'$ 694', '$ 573', '$ 724', '$ 853', '$ 372', '$ 65', '$ 218',
'$ 877', '$ 578', '$ 988', '$ 534', '$ 677', '$ 907', '$ 893',
'$ 903', '$ 716', '$ 484', '$ 898', '$ 842', '$ 343', '$ 312',
'$ 396', '$ 229', '$ 889', '$ 313', '$ 127', '$ 124', '$ 825',
'$ 434', '$ 670', '$ 935', '$ 489', '$ 850', '$ 929', '$ 441',
'$ 474', '$ 668', '$ 644', '$ 818', '$ 194', '$ 801', '$ 391',
'$ 845', '$ 922', '$ 574', '$ 280', '$ 427', '$ 206', '$ 515',
'$ 718', '$ 961', '$ 773', '$ 962', '$ 579', '$ 768', '$ 73',
'$ 498', '$ 679', '$ 525', '$ 345', '$ 387', '$ 146', '$ 649',
'$ 491', '$ 812', '$ 682', '$ 193', '$ 143', '$ 789', '$ 576',
'$ 742', '$ 938', '$ 868', '$ 643', '$ 347', '$ 835', '$ 117',
'$ 840', '$ 457', '$ 442', '$ 458', '$ 767', '$ 908', '$ 255',
'$ 608', '$ 547', '$ 58', '$ 913', '$ 95', '$ 878', '$ 570',
'$ 157', '$ 183', '$ 84', '$ 651', '$ 564', '$ 188', '$ 860',
'$ 162', '$ 267', '$ 803', '$ 963', '$ 56', '$ 770', '$ 584',
'$ 983', '$ 250', '$ 594', '$ 832', '$ 200', '$ 692', '$ 891',
'$ 260', '$ 338', '$ 454', '$ 208', '$ 114', '$ 743', '$ 830',
'$ 111', '$ 981', '$ 166', '$ 308', '$ 965', '$ 357', '$ 568',
'$ 261', '$ 704', '$ 247', '$ 262', '$ 363', '$ 319', '$ 954',
'$ 137', '$ 147', '$ 240', '$ 822', '$ 669', '$ 513', '$ 405',
'$ 268', '$ 897', '$ 148', '$ 109', '$ 699', '$ 582', '$ 559',
'$ 167', '$ 245', '$ 64', '$ 284', '$ 472', '$ 403', '$ 413',
'$ 711', '$ 790', '$ 102', '$ 315', '$ 765', '$ 985', '$ 478',
'$ 957', '$ 552', '$ 715', '$ 703', '$ 648', '$ 689', '$ 242',
'$ 945', '$ 340', '$ 687', '$ 591', '$ 745', '$ 374', '$ 138',
'$ 371', '$ 556', '$ 411', '$ 173', '$ 444', '$ 736', '$ 944',
'$ 831', '$ 613', '$ 993', '$ 239', '$ 844', '$ 674', '$ 542',
'$ 395', '$ 829', '$ 915', '$ 470', '$ 939', '$ 94', '$ 665',
'$ 506', '$ 730', '$ 869', '$ 187', '$ 120', '$ 726', '$ 966',
'$ 445', '$ 83', '$ 70', '$ 393', '$ 487', '$ 301', '$ 862',
'$ 721', '$ 86', '$ 999', '$ 493', '$ 560', '$ 116', '$ 139',
'$ 424', '$ 63', '$ 66', '$ 422', '$ 664', '$ 551', '$ 269',
'$ 923', '$ 451', '$ 225', '$ 919', '$ 273', '$ 620', '$ 349',
'$ 906', '$ 492', '$ 928', '$ 545', '$ 163', '$ 666', '$ 819',
'$ 201', '$ 995', '$ 740', '$ 300', '$ 894', '$ 528', '$ 720',
'$ 821', '$ 794', '$ 680', '$ 352', '$ 550', '$ 274', '$ 264',
'$ 737', '$ 390', '$ 911', '$ 761', '$ 901', '$ 161', '$ 98',
'$ 254', '$ 507', '$ 641', '$ 359', '$ 375', '$ 769', '$ 348',
'$ 972', '$ 675', '$ 693', '$ 597', '$ 159', '$ 932', '$ 324',
'$ 373', '$ 486', '$ 99', '$ 875', '$ 725', '$ 518', '$ 546',
'$ 586', '$ 824', '$ 621', '$ 904', '$ 417', '$ 89', '$ 554',
'$ 68', '$ 633', '$ 612', '$ 485', '$ 779', '$ 500', '$ 406',
'$ 642', '$ 263', '$ 587', '$ 105', '$ 100', '$ 217', '$ 705',
'$ 213', '$ 900', '$ 859', '$ 281', '$ 964', '$ 854', '$ 476',
'$ 990', '$ 366', '$ 930', '$ 970', '$ 630', '$ 335', '$ 409',
'$ 811', '$ 795', '$ 235', '$ 483', '$ 77', '$ 88', '$ 259',
'$ 857', '$ 202', '$ 361', '$ 710', '$ 663', '$ 134', '$ 884',
'$ 602', '$ 632', '$ 678', '$ 512', '$ 421', '$ 156', '$ 249',
'$ 439', '$ 298', '$ 477', '$ 447', '$ 510', '$ 440', '$ 382',
'$ 321', '$ 176', '$ 85', '$ 596', '$ 152', '$ 316', '$ 876',
'$ 839', '$ 118', '$ 757', '$ 600', '$ 624', '$ 702', '$ 852',
'$ 509', '$ 532', '$ 826', '$ 135', '$ 180', '$ 816', '$ 292',
'$ 365', '$ 696', '$ 738', '$ 843', '$ 871', '$ 435', '$ 524',
'$ 307', '$ 727', '$ 224', '$ 974', '$ 774', '$ 940', '$ 332',
'$ 764', '$ 252', '$ 333', '$ 791', '$ 401', '$ 154', '$ 503',
'$ 113', '$ 958', '$ 230', '$ 410', '$ 431', '$ 537', '$ 331',
'$ 706', '$ 459', '$ 750', '$ 775', '$ 516', '$ 754', '$ 548',
'$ 304', '$ 446', '$ 815', '$ 430', '$ 322', '$ 497', '$ 337',
'$ 841', '$ 690', '$ 227', '$ 226', '$ 530', '$ 433', '$ 74',
'$ 360', '$ 197', '$ 384', '$ 739', '$ 949', '$ 122', '$ 787',
'$ 914', '$ 378', '$ 870', '$ 320', '$ 123', '$ 776', '$ 241',
'$ 125', '$ 465', '$ 287', '$ 589', '$ 294', '$ 748', '$ -611',
'$ 254000'], dtype=object)
# Vemos que el problema esta con los signos pesos, algunos negativos y comas en lugar de puntos. Empezaremos quitando el signo
# y cambiando las compas por puntos. Tambien se cambiaran los nan por ceros.
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].str.slice_replace(stop=2, repl='')
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].replace(np.nan, '0.0')
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].replace(',', '.',regex=True)
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].astype(float, errors = 'raise')
LosAlpesHistorydf['price'].unique()
array([ 8.51e+02, 4.66e+02, 8.74e+02, 8.13e+02, 3.26e+02, 7.86e+02,
8.63e+02, 7.49e+02, 3.97e+02, 4.62e+02, 1.21e+02, 1.08e+00,
4.43e+02, 7.07e+02, 9.25e+02, 9.97e+02, 8.87e+02, 7.60e+02,
6.71e+02, 1.15e+00, 2.77e+02, 1.01e+00, 3.89e+02, 7.99e+02,
1.06e+00, 8.28e+02, 8.33e+02, 3.06e+02, 1.15e+02, 8.36e+02,
8.00e+01, 1.98e+02, 8.66e+02, 5.55e+02, 1.04e+00, 1.14e+00,
2.79e+02, 6.85e+02, 3.81e+02, 7.84e+02, 7.23e+02, 1.03e+02,
2.15e+02, 9.92e+02, 5.92e+02, 1.64e+02, 9.00e+01, 7.81e+02,
4.04e+02, 7.20e+01, 1.75e+02, 4.94e+02, 9.60e+02, 2.72e+02,
2.32e+02, 1.16e+00, 4.28e+02, 7.80e+02, 2.23e+02, 1.12e+02,
1.55e+02, 1.04e+02, 9.02e+02, 8.65e+02, 1.19e+02, 5.01e+02,
6.07e+02, 1.02e+00, 2.83e+02, 2.71e+02, 2.76e+02, 5.41e+02,
0.00e+00, 5.93e+02, 2.12e+02, 1.19e+00, 8.38e+02, 5.53e+02,
1.72e+02, 1.03e+00, 4.82e+02, 5.22e+02, 1.68e+02, 8.99e+02,
1.12e+00, 4.75e+02, 1.86e+02, 5.85e+02, 5.08e+02, 1.13e+00,
1.11e+00, 3.03e+02, 1.10e+00, 6.11e+02, 2.51e+02, 8.96e+02,
2.96e+02, 5.83e+02, 1.00e+00, 1.06e+02, 8.14e+02, 5.44e+02,
8.47e+02, 5.77e+02, 6.15e+02, 5.98e+02, 2.99e+02, 8.06e+02,
6.00e+01, 9.86e+02, 4.63e+02, 3.39e+02, 1.07e+00, 9.26e+02,
4.80e+02, 4.90e+02, 5.63e+02, 1.53e+02, 1.45e+02, 3.50e+02,
7.28e+02, 5.36e+02, 9.69e+02, 9.48e+02, 4.48e+02, 6.09e+02,
3.67e+02, 3.79e+02, 6.18e+02, 6.97e+02, 2.91e+02, 1.07e+02,
7.77e+02, 3.77e+02, 6.28e+02, 9.68e+02, 3.44e+02, 1.05e+00,
6.17e+02, 1.18e+00, 6.58e+02, 1.85e+02, 3.76e+02, 1.90e+02,
6.83e+02, 4.55e+02, 5.21e+02, 2.38e+02, 8.27e+02, 1.29e+02,
2.37e+02, 9.75e+02, 3.27e+02, 4.37e+02, 3.17e+02, 6.81e+02,
3.53e+02, 2.05e+02, 5.58e+02, 9.78e+02, 8.08e+02, 8.23e+02,
6.95e+02, 2.95e+02, 1.09e+00, 9.16e+02, 7.93e+02, 8.37e+02,
2.16e+02, 2.88e+02, 4.02e+02, 9.09e+02, 1.36e+02, 9.55e+02,
3.54e+02, 4.64e+02, 3.68e+02, 6.27e+02, 5.20e+01, 9.56e+02,
4.71e+02, 1.10e+02, 2.09e+02, 3.88e+02, 7.44e+02, 1.82e+02,
9.80e+02, 7.46e+02, 8.07e+02, 6.59e+02, 5.71e+02, 5.17e+02,
5.35e+02, 3.23e+02, 7.35e+02, 9.67e+02, 3.11e+02, 5.66e+02,
6.20e+01, 7.08e+02, 5.00e+01, 9.60e+01, 5.99e+02, 6.14e+02,
4.99e+02, 7.34e+02, 5.40e+01, 5.88e+02, 7.92e+02, 6.98e+02,
5.49e+02, 4.19e+02, 7.88e+02, 8.90e+02, 3.69e+02, 7.90e+01,
4.16e+02, 9.91e+02, 7.59e+02, 7.53e+02, 7.09e+02, 9.79e+02,
4.14e+02, 9.73e+02, 5.57e+02, 1.17e+00, 3.70e+02, 9.47e+02,
4.15e+02, 3.99e+02, 9.89e+02, 4.50e+02, 8.82e+02, 3.98e+02,
8.00e+02, 7.66e+02, 8.86e+02, 3.34e+02, 8.85e+02, 9.31e+02,
7.60e+01, 2.11e+02, 4.81e+02, 2.97e+02, 6.56e+02, 7.17e+02,
9.46e+02, 2.58e+02, 4.12e+02, 4.00e+02, 2.66e+02, 1.71e+02,
9.10e+02, 6.72e+02, 4.95e+02, 9.59e+02, 5.95e+02, 4.60e+02,
5.10e+01, 2.78e+02, 4.20e+02, 8.10e+01, 9.12e+02, 9.27e+02,
2.14e+02, 2.57e+02, 1.40e+02, 2.53e+02, 5.38e+02, 9.52e+02,
1.51e+02, 7.55e+02, 7.33e+02, 4.73e+02, 8.73e+02, 9.98e+02,
7.80e+01, 5.72e+02, 3.83e+02, 8.46e+02, 6.55e+02, 2.31e+02,
2.93e+02, 8.83e+02, 3.62e+02, 4.18e+02, 5.04e+02, 6.53e+02,
8.95e+02, 8.79e+02, 2.28e+02, 9.76e+02, 2.65e+02, 3.10e+02,
6.10e+01, 1.50e+02, 6.39e+02, 6.16e+02, 1.77e+02, 7.82e+02,
4.79e+02, 3.25e+02, 1.30e+02, 4.25e+02, 5.02e+02, 6.45e+02,
8.56e+02, 1.65e+02, 1.96e+02, 3.94e+02, 6.61e+02, 6.34e+02,
3.58e+02, 5.62e+02, 5.80e+02, 3.05e+02, 7.83e+02, 6.31e+02,
7.31e+02, 7.22e+02, 2.07e+02, 4.67e+02, 2.03e+02, 9.21e+02,
4.88e+02, 3.09e+02, 1.28e+02, 2.33e+02, 2.21e+02, 5.14e+02,
6.05e+02, 8.49e+02, 7.62e+02, 9.42e+02, 6.91e+02, 3.55e+02,
6.35e+02, 8.20e+02, 6.10e+02, 4.69e+02, 8.05e+02, 2.34e+02,
7.52e+02, 5.11e+02, 9.17e+02, 2.46e+02, 5.29e+02, 7.50e+01,
6.76e+02, 9.30e+01, 9.82e+02, 9.20e+01, 9.96e+02, 9.77e+02,
1.41e+02, 5.65e+02, 4.56e+02, 8.88e+02, 5.61e+02, 3.64e+02,
6.50e+02, 1.84e+02, 2.70e+02, 8.04e+02, 2.22e+02, 6.04e+02,
6.88e+02, 8.58e+02, 7.10e+01, 9.50e+02, 2.20e+02, 7.98e+02,
7.47e+02, 9.34e+02, 6.26e+02, 1.33e+02, 5.39e+02, 9.10e+01,
8.17e+02, 6.03e+02, 6.01e+02, 7.14e+02, 5.31e+02, 8.55e+02,
5.23e+02, 3.92e+02, 1.89e+02, 1.74e+02, 3.42e+02, 6.57e+02,
8.10e+02, 2.04e+02, 3.14e+02, 1.42e+02, 1.58e+02, 6.84e+02,
3.86e+02, 9.20e+02, 5.26e+02, 5.19e+02, 1.31e+02, 8.48e+02,
4.52e+02, 8.64e+02, 7.19e+02, 5.33e+02, 6.22e+02, 1.69e+02,
3.18e+02, 7.96e+02, 6.25e+02, 9.84e+02, 2.56e+02, 7.72e+02,
1.20e+00, 8.70e+01, 6.38e+02, 4.07e+02, 7.58e+02, 8.92e+02,
1.44e+02, 9.41e+02, 4.26e+02, 1.91e+02, 6.86e+02, 4.68e+02,
7.01e+02, 9.51e+02, 5.05e+02, 7.97e+02, 4.96e+02, 9.43e+02,
8.34e+02, 7.00e+02, 6.23e+02, 6.29e+02, 3.46e+02, 2.86e+02,
5.30e+01, 8.02e+02, 2.85e+02, 1.92e+02, 3.56e+02, 3.85e+02,
5.43e+02, 9.53e+02, 1.81e+02, 5.90e+02, 5.90e+01, 6.46e+02,
9.18e+02, 1.95e+02, 8.80e+02, 9.05e+02, 8.67e+02, 2.82e+02,
7.63e+02, 6.40e+02, 4.29e+02, 4.61e+02, 3.28e+02, 5.67e+02,
7.29e+02, 6.60e+02, 2.36e+02, 8.09e+02, 5.69e+02, 5.27e+02,
2.44e+02, 8.61e+02, 2.43e+02, 9.33e+02, 6.73e+02, 8.72e+02,
7.13e+02, 8.81e+02, 3.30e+02, 4.53e+02, 9.36e+02, 2.90e+02,
6.67e+02, 2.75e+02, 3.02e+02, 6.06e+02, 7.41e+02, 1.08e+02,
5.81e+02, 9.71e+02, 4.08e+02, 1.70e+02, 5.20e+02, 6.52e+02,
1.32e+02, 6.36e+02, 6.70e+01, 4.49e+02, 2.19e+02, 7.56e+02,
5.40e+02, 7.78e+02, 1.01e+02, 4.38e+02, 7.32e+02, 7.85e+02,
7.71e+02, 6.54e+02, 5.50e+01, 6.47e+02, 2.89e+02, 3.36e+02,
5.75e+02, 6.90e+01, 2.10e+02, 3.41e+02, 1.99e+02, 1.26e+02,
6.62e+02, 3.51e+02, 4.32e+02, 3.29e+02, 7.12e+02, 1.78e+02,
1.60e+02, 7.51e+02, 8.20e+01, 3.80e+02, 5.70e+01, 1.49e+02,
6.37e+02, 9.70e+01, 4.23e+02, 9.24e+02, 6.19e+02, 2.48e+02,
9.94e+02, 4.36e+02, 9.87e+02, 9.37e+02, 1.79e+02, 6.94e+02,
5.73e+02, 7.24e+02, 8.53e+02, 3.72e+02, 6.50e+01, 2.18e+02,
8.77e+02, 5.78e+02, 9.88e+02, 5.34e+02, 6.77e+02, 9.07e+02,
8.93e+02, 9.03e+02, 7.16e+02, 4.84e+02, 8.98e+02, 8.42e+02,
3.43e+02, 3.12e+02, 3.96e+02, 2.29e+02, 8.89e+02, 3.13e+02,
1.27e+02, 1.24e+02, 8.25e+02, 4.34e+02, 6.70e+02, 9.35e+02,
4.89e+02, 8.50e+02, 9.29e+02, 4.41e+02, 4.74e+02, 6.68e+02,
6.44e+02, 8.18e+02, 1.94e+02, 8.01e+02, 3.91e+02, 8.45e+02,
9.22e+02, 5.74e+02, 2.80e+02, 4.27e+02, 2.06e+02, 5.15e+02,
7.18e+02, 9.61e+02, 7.73e+02, 9.62e+02, 5.79e+02, 7.68e+02,
7.30e+01, 4.98e+02, 6.79e+02, 5.25e+02, 3.45e+02, 3.87e+02,
1.46e+02, 6.49e+02, 4.91e+02, 8.12e+02, 6.82e+02, 1.93e+02,
1.43e+02, 7.89e+02, 5.76e+02, 7.42e+02, 9.38e+02, 8.68e+02,
6.43e+02, 3.47e+02, 8.35e+02, 1.17e+02, 8.40e+02, 4.57e+02,
4.42e+02, 4.58e+02, 7.67e+02, 9.08e+02, 2.55e+02, 6.08e+02,
5.47e+02, 5.80e+01, 9.13e+02, 9.50e+01, 8.78e+02, 5.70e+02,
1.57e+02, 1.83e+02, 8.40e+01, 6.51e+02, 5.64e+02, 1.88e+02,
8.60e+02, 1.62e+02, 2.67e+02, 8.03e+02, 9.63e+02, 5.60e+01,
7.70e+02, 5.84e+02, 9.83e+02, 2.50e+02, 5.94e+02, 8.32e+02,
2.00e+02, 6.92e+02, 8.91e+02, 2.60e+02, 3.38e+02, 4.54e+02,
2.08e+02, 1.14e+02, 7.43e+02, 8.30e+02, 1.11e+02, 9.81e+02,
1.66e+02, 3.08e+02, 9.65e+02, 3.57e+02, 5.68e+02, 2.61e+02,
7.04e+02, 2.47e+02, 2.62e+02, 3.63e+02, 3.19e+02, 9.54e+02,
1.37e+02, 1.47e+02, 2.40e+02, 8.22e+02, 6.69e+02, 5.13e+02,
4.05e+02, 2.68e+02, 8.97e+02, 1.48e+02, 1.09e+02, 6.99e+02,
5.82e+02, 5.59e+02, 1.67e+02, 2.45e+02, 6.40e+01, 2.84e+02,
4.72e+02, 4.03e+02, 4.13e+02, 7.11e+02, 7.90e+02, 1.02e+02,
3.15e+02, 7.65e+02, 9.85e+02, 4.78e+02, 9.57e+02, 5.52e+02,
7.15e+02, 7.03e+02, 6.48e+02, 6.89e+02, 2.42e+02, 9.45e+02,
3.40e+02, 6.87e+02, 5.91e+02, 7.45e+02, 3.74e+02, 1.38e+02,
3.71e+02, 5.56e+02, 4.11e+02, 1.73e+02, 4.44e+02, 7.36e+02,
9.44e+02, 8.31e+02, 6.13e+02, 9.93e+02, 2.39e+02, 8.44e+02,
6.74e+02, 5.42e+02, 3.95e+02, 8.29e+02, 9.15e+02, 4.70e+02,
9.39e+02, 9.40e+01, 6.65e+02, 5.06e+02, 7.30e+02, 8.69e+02,
1.87e+02, 1.20e+02, 7.26e+02, 9.66e+02, 4.45e+02, 8.30e+01,
7.00e+01, 3.93e+02, 4.87e+02, 3.01e+02, 8.62e+02, 7.21e+02,
8.60e+01, 9.99e+02, 4.93e+02, 5.60e+02, 1.16e+02, 1.39e+02,
4.24e+02, 6.30e+01, 6.60e+01, 4.22e+02, 6.64e+02, 5.51e+02,
2.69e+02, 9.23e+02, 4.51e+02, 2.25e+02, 9.19e+02, 2.73e+02,
6.20e+02, 3.49e+02, 9.06e+02, 4.92e+02, 9.28e+02, 5.45e+02,
1.63e+02, 6.66e+02, 8.19e+02, 2.01e+02, 9.95e+02, 7.40e+02,
3.00e+02, 8.94e+02, 5.28e+02, 7.20e+02, 8.21e+02, 7.94e+02,
6.80e+02, 3.52e+02, 5.50e+02, 2.74e+02, 2.64e+02, 7.37e+02,
3.90e+02, 9.11e+02, 7.61e+02, 9.01e+02, 1.61e+02, 9.80e+01,
2.54e+02, 5.07e+02, 6.41e+02, 3.59e+02, 3.75e+02, 7.69e+02,
3.48e+02, 9.72e+02, 6.75e+02, 6.93e+02, 5.97e+02, 1.59e+02,
9.32e+02, 3.24e+02, 3.73e+02, 4.86e+02, 9.90e+01, 8.75e+02,
7.25e+02, 5.18e+02, 5.46e+02, 5.86e+02, 8.24e+02, 6.21e+02,
9.04e+02, 4.17e+02, 8.90e+01, 5.54e+02, 6.80e+01, 6.33e+02,
6.12e+02, 4.85e+02, 7.79e+02, 5.00e+02, 4.06e+02, 6.42e+02,
2.63e+02, 5.87e+02, 1.05e+02, 1.00e+02, 2.17e+02, 7.05e+02,
2.13e+02, 9.00e+02, 8.59e+02, 2.81e+02, 9.64e+02, 8.54e+02,
4.76e+02, 9.90e+02, 3.66e+02, 9.30e+02, 9.70e+02, 6.30e+02,
3.35e+02, 4.09e+02, 8.11e+02, 7.95e+02, 2.35e+02, 4.83e+02,
7.70e+01, 8.80e+01, 2.59e+02, 8.57e+02, 2.02e+02, 3.61e+02,
7.10e+02, 6.63e+02, 1.34e+02, 8.84e+02, 6.02e+02, 6.32e+02,
6.78e+02, 5.12e+02, 4.21e+02, 1.56e+02, 2.49e+02, 4.39e+02,
2.98e+02, 4.77e+02, 4.47e+02, 5.10e+02, 4.40e+02, 3.82e+02,
3.21e+02, 1.76e+02, 8.50e+01, 5.96e+02, 1.52e+02, 3.16e+02,
8.76e+02, 8.39e+02, 1.18e+02, 7.57e+02, 6.00e+02, 6.24e+02,
7.02e+02, 8.52e+02, 5.09e+02, 5.32e+02, 8.26e+02, 1.35e+02,
1.80e+02, 8.16e+02, 2.92e+02, 3.65e+02, 6.96e+02, 7.38e+02,
8.43e+02, 8.71e+02, 4.35e+02, 5.24e+02, 3.07e+02, 7.27e+02,
2.24e+02, 9.74e+02, 7.74e+02, 9.40e+02, 3.32e+02, 7.64e+02,
2.52e+02, 3.33e+02, 7.91e+02, 4.01e+02, 1.54e+02, 5.03e+02,
1.13e+02, 9.58e+02, 2.30e+02, 4.10e+02, 4.31e+02, 5.37e+02,
3.31e+02, 7.06e+02, 4.59e+02, 7.50e+02, 7.75e+02, 5.16e+02,
7.54e+02, 5.48e+02, 3.04e+02, 4.46e+02, 8.15e+02, 4.30e+02,
3.22e+02, 4.97e+02, 3.37e+02, 8.41e+02, 6.90e+02, 2.27e+02,
2.26e+02, 5.30e+02, 4.33e+02, 7.40e+01, 3.60e+02, 1.97e+02,
3.84e+02, 7.39e+02, 9.49e+02, 1.22e+02, 7.87e+02, 9.14e+02,
3.78e+02, 8.70e+02, 3.20e+02, 1.23e+02, 7.76e+02, 2.41e+02,
1.25e+02, 4.65e+02, 2.87e+02, 5.89e+02, 2.94e+02, 7.48e+02,
-6.11e+02, 2.54e+05])
def corregirNegativo(x):
if x > 0:
return x
elif x < 0:
return -1*x
else:
return 0
# Corregimos los negativos suponiendo que son problemas de digitación. Esto mismo lo haremos en las siguientes variables.
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].apply(corregirNegativo)
LosAlpesHistorydf['price'].unique()
array([8.51e+02, 4.66e+02, 8.74e+02, 8.13e+02, 3.26e+02, 7.86e+02,
8.63e+02, 7.49e+02, 3.97e+02, 4.62e+02, 1.21e+02, 1.08e+00,
4.43e+02, 7.07e+02, 9.25e+02, 9.97e+02, 8.87e+02, 7.60e+02,
6.71e+02, 1.15e+00, 2.77e+02, 1.01e+00, 3.89e+02, 7.99e+02,
1.06e+00, 8.28e+02, 8.33e+02, 3.06e+02, 1.15e+02, 8.36e+02,
8.00e+01, 1.98e+02, 8.66e+02, 5.55e+02, 1.04e+00, 1.14e+00,
2.79e+02, 6.85e+02, 3.81e+02, 7.84e+02, 7.23e+02, 1.03e+02,
2.15e+02, 9.92e+02, 5.92e+02, 1.64e+02, 9.00e+01, 7.81e+02,
4.04e+02, 7.20e+01, 1.75e+02, 4.94e+02, 9.60e+02, 2.72e+02,
2.32e+02, 1.16e+00, 4.28e+02, 7.80e+02, 2.23e+02, 1.12e+02,
1.55e+02, 1.04e+02, 9.02e+02, 8.65e+02, 1.19e+02, 5.01e+02,
6.07e+02, 1.02e+00, 2.83e+02, 2.71e+02, 2.76e+02, 5.41e+02,
0.00e+00, 5.93e+02, 2.12e+02, 1.19e+00, 8.38e+02, 5.53e+02,
1.72e+02, 1.03e+00, 4.82e+02, 5.22e+02, 1.68e+02, 8.99e+02,
1.12e+00, 4.75e+02, 1.86e+02, 5.85e+02, 5.08e+02, 1.13e+00,
1.11e+00, 3.03e+02, 1.10e+00, 6.11e+02, 2.51e+02, 8.96e+02,
2.96e+02, 5.83e+02, 1.00e+00, 1.06e+02, 8.14e+02, 5.44e+02,
8.47e+02, 5.77e+02, 6.15e+02, 5.98e+02, 2.99e+02, 8.06e+02,
6.00e+01, 9.86e+02, 4.63e+02, 3.39e+02, 1.07e+00, 9.26e+02,
4.80e+02, 4.90e+02, 5.63e+02, 1.53e+02, 1.45e+02, 3.50e+02,
7.28e+02, 5.36e+02, 9.69e+02, 9.48e+02, 4.48e+02, 6.09e+02,
3.67e+02, 3.79e+02, 6.18e+02, 6.97e+02, 2.91e+02, 1.07e+02,
7.77e+02, 3.77e+02, 6.28e+02, 9.68e+02, 3.44e+02, 1.05e+00,
6.17e+02, 1.18e+00, 6.58e+02, 1.85e+02, 3.76e+02, 1.90e+02,
6.83e+02, 4.55e+02, 5.21e+02, 2.38e+02, 8.27e+02, 1.29e+02,
2.37e+02, 9.75e+02, 3.27e+02, 4.37e+02, 3.17e+02, 6.81e+02,
3.53e+02, 2.05e+02, 5.58e+02, 9.78e+02, 8.08e+02, 8.23e+02,
6.95e+02, 2.95e+02, 1.09e+00, 9.16e+02, 7.93e+02, 8.37e+02,
2.16e+02, 2.88e+02, 4.02e+02, 9.09e+02, 1.36e+02, 9.55e+02,
3.54e+02, 4.64e+02, 3.68e+02, 6.27e+02, 5.20e+01, 9.56e+02,
4.71e+02, 1.10e+02, 2.09e+02, 3.88e+02, 7.44e+02, 1.82e+02,
9.80e+02, 7.46e+02, 8.07e+02, 6.59e+02, 5.71e+02, 5.17e+02,
5.35e+02, 3.23e+02, 7.35e+02, 9.67e+02, 3.11e+02, 5.66e+02,
6.20e+01, 7.08e+02, 5.00e+01, 9.60e+01, 5.99e+02, 6.14e+02,
4.99e+02, 7.34e+02, 5.40e+01, 5.88e+02, 7.92e+02, 6.98e+02,
5.49e+02, 4.19e+02, 7.88e+02, 8.90e+02, 3.69e+02, 7.90e+01,
4.16e+02, 9.91e+02, 7.59e+02, 7.53e+02, 7.09e+02, 9.79e+02,
4.14e+02, 9.73e+02, 5.57e+02, 1.17e+00, 3.70e+02, 9.47e+02,
4.15e+02, 3.99e+02, 9.89e+02, 4.50e+02, 8.82e+02, 3.98e+02,
8.00e+02, 7.66e+02, 8.86e+02, 3.34e+02, 8.85e+02, 9.31e+02,
7.60e+01, 2.11e+02, 4.81e+02, 2.97e+02, 6.56e+02, 7.17e+02,
9.46e+02, 2.58e+02, 4.12e+02, 4.00e+02, 2.66e+02, 1.71e+02,
9.10e+02, 6.72e+02, 4.95e+02, 9.59e+02, 5.95e+02, 4.60e+02,
5.10e+01, 2.78e+02, 4.20e+02, 8.10e+01, 9.12e+02, 9.27e+02,
2.14e+02, 2.57e+02, 1.40e+02, 2.53e+02, 5.38e+02, 9.52e+02,
1.51e+02, 7.55e+02, 7.33e+02, 4.73e+02, 8.73e+02, 9.98e+02,
7.80e+01, 5.72e+02, 3.83e+02, 8.46e+02, 6.55e+02, 2.31e+02,
2.93e+02, 8.83e+02, 3.62e+02, 4.18e+02, 5.04e+02, 6.53e+02,
8.95e+02, 8.79e+02, 2.28e+02, 9.76e+02, 2.65e+02, 3.10e+02,
6.10e+01, 1.50e+02, 6.39e+02, 6.16e+02, 1.77e+02, 7.82e+02,
4.79e+02, 3.25e+02, 1.30e+02, 4.25e+02, 5.02e+02, 6.45e+02,
8.56e+02, 1.65e+02, 1.96e+02, 3.94e+02, 6.61e+02, 6.34e+02,
3.58e+02, 5.62e+02, 5.80e+02, 3.05e+02, 7.83e+02, 6.31e+02,
7.31e+02, 7.22e+02, 2.07e+02, 4.67e+02, 2.03e+02, 9.21e+02,
4.88e+02, 3.09e+02, 1.28e+02, 2.33e+02, 2.21e+02, 5.14e+02,
6.05e+02, 8.49e+02, 7.62e+02, 9.42e+02, 6.91e+02, 3.55e+02,
6.35e+02, 8.20e+02, 6.10e+02, 4.69e+02, 8.05e+02, 2.34e+02,
7.52e+02, 5.11e+02, 9.17e+02, 2.46e+02, 5.29e+02, 7.50e+01,
6.76e+02, 9.30e+01, 9.82e+02, 9.20e+01, 9.96e+02, 9.77e+02,
1.41e+02, 5.65e+02, 4.56e+02, 8.88e+02, 5.61e+02, 3.64e+02,
6.50e+02, 1.84e+02, 2.70e+02, 8.04e+02, 2.22e+02, 6.04e+02,
6.88e+02, 8.58e+02, 7.10e+01, 9.50e+02, 2.20e+02, 7.98e+02,
7.47e+02, 9.34e+02, 6.26e+02, 1.33e+02, 5.39e+02, 9.10e+01,
8.17e+02, 6.03e+02, 6.01e+02, 7.14e+02, 5.31e+02, 8.55e+02,
5.23e+02, 3.92e+02, 1.89e+02, 1.74e+02, 3.42e+02, 6.57e+02,
8.10e+02, 2.04e+02, 3.14e+02, 1.42e+02, 1.58e+02, 6.84e+02,
3.86e+02, 9.20e+02, 5.26e+02, 5.19e+02, 1.31e+02, 8.48e+02,
4.52e+02, 8.64e+02, 7.19e+02, 5.33e+02, 6.22e+02, 1.69e+02,
3.18e+02, 7.96e+02, 6.25e+02, 9.84e+02, 2.56e+02, 7.72e+02,
1.20e+00, 8.70e+01, 6.38e+02, 4.07e+02, 7.58e+02, 8.92e+02,
1.44e+02, 9.41e+02, 4.26e+02, 1.91e+02, 6.86e+02, 4.68e+02,
7.01e+02, 9.51e+02, 5.05e+02, 7.97e+02, 4.96e+02, 9.43e+02,
8.34e+02, 7.00e+02, 6.23e+02, 6.29e+02, 3.46e+02, 2.86e+02,
5.30e+01, 8.02e+02, 2.85e+02, 1.92e+02, 3.56e+02, 3.85e+02,
5.43e+02, 9.53e+02, 1.81e+02, 5.90e+02, 5.90e+01, 6.46e+02,
9.18e+02, 1.95e+02, 8.80e+02, 9.05e+02, 8.67e+02, 2.82e+02,
7.63e+02, 6.40e+02, 4.29e+02, 4.61e+02, 3.28e+02, 5.67e+02,
7.29e+02, 6.60e+02, 2.36e+02, 8.09e+02, 5.69e+02, 5.27e+02,
2.44e+02, 8.61e+02, 2.43e+02, 9.33e+02, 6.73e+02, 8.72e+02,
7.13e+02, 8.81e+02, 3.30e+02, 4.53e+02, 9.36e+02, 2.90e+02,
6.67e+02, 2.75e+02, 3.02e+02, 6.06e+02, 7.41e+02, 1.08e+02,
5.81e+02, 9.71e+02, 4.08e+02, 1.70e+02, 5.20e+02, 6.52e+02,
1.32e+02, 6.36e+02, 6.70e+01, 4.49e+02, 2.19e+02, 7.56e+02,
5.40e+02, 7.78e+02, 1.01e+02, 4.38e+02, 7.32e+02, 7.85e+02,
7.71e+02, 6.54e+02, 5.50e+01, 6.47e+02, 2.89e+02, 3.36e+02,
5.75e+02, 6.90e+01, 2.10e+02, 3.41e+02, 1.99e+02, 1.26e+02,
6.62e+02, 3.51e+02, 4.32e+02, 3.29e+02, 7.12e+02, 1.78e+02,
1.60e+02, 7.51e+02, 8.20e+01, 3.80e+02, 5.70e+01, 1.49e+02,
6.37e+02, 9.70e+01, 4.23e+02, 9.24e+02, 6.19e+02, 2.48e+02,
9.94e+02, 4.36e+02, 9.87e+02, 9.37e+02, 1.79e+02, 6.94e+02,
5.73e+02, 7.24e+02, 8.53e+02, 3.72e+02, 6.50e+01, 2.18e+02,
8.77e+02, 5.78e+02, 9.88e+02, 5.34e+02, 6.77e+02, 9.07e+02,
8.93e+02, 9.03e+02, 7.16e+02, 4.84e+02, 8.98e+02, 8.42e+02,
3.43e+02, 3.12e+02, 3.96e+02, 2.29e+02, 8.89e+02, 3.13e+02,
1.27e+02, 1.24e+02, 8.25e+02, 4.34e+02, 6.70e+02, 9.35e+02,
4.89e+02, 8.50e+02, 9.29e+02, 4.41e+02, 4.74e+02, 6.68e+02,
6.44e+02, 8.18e+02, 1.94e+02, 8.01e+02, 3.91e+02, 8.45e+02,
9.22e+02, 5.74e+02, 2.80e+02, 4.27e+02, 2.06e+02, 5.15e+02,
7.18e+02, 9.61e+02, 7.73e+02, 9.62e+02, 5.79e+02, 7.68e+02,
7.30e+01, 4.98e+02, 6.79e+02, 5.25e+02, 3.45e+02, 3.87e+02,
1.46e+02, 6.49e+02, 4.91e+02, 8.12e+02, 6.82e+02, 1.93e+02,
1.43e+02, 7.89e+02, 5.76e+02, 7.42e+02, 9.38e+02, 8.68e+02,
6.43e+02, 3.47e+02, 8.35e+02, 1.17e+02, 8.40e+02, 4.57e+02,
4.42e+02, 4.58e+02, 7.67e+02, 9.08e+02, 2.55e+02, 6.08e+02,
5.47e+02, 5.80e+01, 9.13e+02, 9.50e+01, 8.78e+02, 5.70e+02,
1.57e+02, 1.83e+02, 8.40e+01, 6.51e+02, 5.64e+02, 1.88e+02,
8.60e+02, 1.62e+02, 2.67e+02, 8.03e+02, 9.63e+02, 5.60e+01,
7.70e+02, 5.84e+02, 9.83e+02, 2.50e+02, 5.94e+02, 8.32e+02,
2.00e+02, 6.92e+02, 8.91e+02, 2.60e+02, 3.38e+02, 4.54e+02,
2.08e+02, 1.14e+02, 7.43e+02, 8.30e+02, 1.11e+02, 9.81e+02,
1.66e+02, 3.08e+02, 9.65e+02, 3.57e+02, 5.68e+02, 2.61e+02,
7.04e+02, 2.47e+02, 2.62e+02, 3.63e+02, 3.19e+02, 9.54e+02,
1.37e+02, 1.47e+02, 2.40e+02, 8.22e+02, 6.69e+02, 5.13e+02,
4.05e+02, 2.68e+02, 8.97e+02, 1.48e+02, 1.09e+02, 6.99e+02,
5.82e+02, 5.59e+02, 1.67e+02, 2.45e+02, 6.40e+01, 2.84e+02,
4.72e+02, 4.03e+02, 4.13e+02, 7.11e+02, 7.90e+02, 1.02e+02,
3.15e+02, 7.65e+02, 9.85e+02, 4.78e+02, 9.57e+02, 5.52e+02,
7.15e+02, 7.03e+02, 6.48e+02, 6.89e+02, 2.42e+02, 9.45e+02,
3.40e+02, 6.87e+02, 5.91e+02, 7.45e+02, 3.74e+02, 1.38e+02,
3.71e+02, 5.56e+02, 4.11e+02, 1.73e+02, 4.44e+02, 7.36e+02,
9.44e+02, 8.31e+02, 6.13e+02, 9.93e+02, 2.39e+02, 8.44e+02,
6.74e+02, 5.42e+02, 3.95e+02, 8.29e+02, 9.15e+02, 4.70e+02,
9.39e+02, 9.40e+01, 6.65e+02, 5.06e+02, 7.30e+02, 8.69e+02,
1.87e+02, 1.20e+02, 7.26e+02, 9.66e+02, 4.45e+02, 8.30e+01,
7.00e+01, 3.93e+02, 4.87e+02, 3.01e+02, 8.62e+02, 7.21e+02,
8.60e+01, 9.99e+02, 4.93e+02, 5.60e+02, 1.16e+02, 1.39e+02,
4.24e+02, 6.30e+01, 6.60e+01, 4.22e+02, 6.64e+02, 5.51e+02,
2.69e+02, 9.23e+02, 4.51e+02, 2.25e+02, 9.19e+02, 2.73e+02,
6.20e+02, 3.49e+02, 9.06e+02, 4.92e+02, 9.28e+02, 5.45e+02,
1.63e+02, 6.66e+02, 8.19e+02, 2.01e+02, 9.95e+02, 7.40e+02,
3.00e+02, 8.94e+02, 5.28e+02, 7.20e+02, 8.21e+02, 7.94e+02,
6.80e+02, 3.52e+02, 5.50e+02, 2.74e+02, 2.64e+02, 7.37e+02,
3.90e+02, 9.11e+02, 7.61e+02, 9.01e+02, 1.61e+02, 9.80e+01,
2.54e+02, 5.07e+02, 6.41e+02, 3.59e+02, 3.75e+02, 7.69e+02,
3.48e+02, 9.72e+02, 6.75e+02, 6.93e+02, 5.97e+02, 1.59e+02,
9.32e+02, 3.24e+02, 3.73e+02, 4.86e+02, 9.90e+01, 8.75e+02,
7.25e+02, 5.18e+02, 5.46e+02, 5.86e+02, 8.24e+02, 6.21e+02,
9.04e+02, 4.17e+02, 8.90e+01, 5.54e+02, 6.80e+01, 6.33e+02,
6.12e+02, 4.85e+02, 7.79e+02, 5.00e+02, 4.06e+02, 6.42e+02,
2.63e+02, 5.87e+02, 1.05e+02, 1.00e+02, 2.17e+02, 7.05e+02,
2.13e+02, 9.00e+02, 8.59e+02, 2.81e+02, 9.64e+02, 8.54e+02,
4.76e+02, 9.90e+02, 3.66e+02, 9.30e+02, 9.70e+02, 6.30e+02,
3.35e+02, 4.09e+02, 8.11e+02, 7.95e+02, 2.35e+02, 4.83e+02,
7.70e+01, 8.80e+01, 2.59e+02, 8.57e+02, 2.02e+02, 3.61e+02,
7.10e+02, 6.63e+02, 1.34e+02, 8.84e+02, 6.02e+02, 6.32e+02,
6.78e+02, 5.12e+02, 4.21e+02, 1.56e+02, 2.49e+02, 4.39e+02,
2.98e+02, 4.77e+02, 4.47e+02, 5.10e+02, 4.40e+02, 3.82e+02,
3.21e+02, 1.76e+02, 8.50e+01, 5.96e+02, 1.52e+02, 3.16e+02,
8.76e+02, 8.39e+02, 1.18e+02, 7.57e+02, 6.00e+02, 6.24e+02,
7.02e+02, 8.52e+02, 5.09e+02, 5.32e+02, 8.26e+02, 1.35e+02,
1.80e+02, 8.16e+02, 2.92e+02, 3.65e+02, 6.96e+02, 7.38e+02,
8.43e+02, 8.71e+02, 4.35e+02, 5.24e+02, 3.07e+02, 7.27e+02,
2.24e+02, 9.74e+02, 7.74e+02, 9.40e+02, 3.32e+02, 7.64e+02,
2.52e+02, 3.33e+02, 7.91e+02, 4.01e+02, 1.54e+02, 5.03e+02,
1.13e+02, 9.58e+02, 2.30e+02, 4.10e+02, 4.31e+02, 5.37e+02,
3.31e+02, 7.06e+02, 4.59e+02, 7.50e+02, 7.75e+02, 5.16e+02,
7.54e+02, 5.48e+02, 3.04e+02, 4.46e+02, 8.15e+02, 4.30e+02,
3.22e+02, 4.97e+02, 3.37e+02, 8.41e+02, 6.90e+02, 2.27e+02,
2.26e+02, 5.30e+02, 4.33e+02, 7.40e+01, 3.60e+02, 1.97e+02,
3.84e+02, 7.39e+02, 9.49e+02, 1.22e+02, 7.87e+02, 9.14e+02,
3.78e+02, 8.70e+02, 3.20e+02, 1.23e+02, 7.76e+02, 2.41e+02,
1.25e+02, 4.65e+02, 2.87e+02, 5.89e+02, 2.94e+02, 7.48e+02,
2.54e+05])
# Limpieza #8: Vamos a aplicar lo mismo con el campo service Fee aunque nos ahorramos el split al no tener decimales.
# Toca volver a hacer la corrección de negativos para el fee negativo y convertir el nan en 0
LosAlpesHistorydf['service fee'].unique()
array(['$ 170', '$ 93', '$ 175', '$ 163', '$ 65', '$ 157', '$ 173',
'$ 150', '$ 79', '$ 92', '$ 24', '$ 216', '$ 89', '$ 141', '$ 185',
'$ 199', '$ 177', '$ 152', '$ 134', '$ 229', '$ 55', '$ 203',
'$ 78', '$ 160', '$ 212', '$ 166', '$ 167', '$ 61', '$ 23',
'$ 230', '$ 16', '$ 40', '$ 111', '$ 207', '$ 56', '$ 137', '$ 76',
'$ 145', '$ 21', '$ 43', '$ 198', '$ 118', '$ 33', '$ 18', '$ 156',
'$ 81', '$ 14', '$ 35', '$ 99', '$ 227', '$ 192', '$ 54', '$ 46',
'$ 232', '$ 86', '$ 45', '$ 22', '$ 31', '$ 180', '$ 100', '$ 121',
'$ 204', '$ 57', '$ 108', nan, '$ 119', '$ 42', '$ 238', '$ 168',
'$ 34', '$ 206', '$ 96', '$ 104', '$ 233', '$ 224', '$ 95', '$ 37',
'$ 117', '$ 102', '$ 226', '$ 221', '$ 219', '$ 122', '$ 50',
'$ 179', '$ 220', '$ 59', '$ 200', '$ 109', '$ 169', '$ 115',
'$ 123', '$ 120', '$ 60', '$ 161', '$ 12', '$ 197', '$ 68',
'$ 213', '$ 98', '$ 113', '$ 29', '$ 70', '$ 146', '$ 107',
'$ 194', '$ 190', '$ 90', '$ 73', '$ 124', '$ 231', '$ 139',
'$ 58', '$ 155', '$ 75', '$ 126', '$ 69', '$ 210', '$ 236',
'$ 132', '$ 38', '$ 91', '$ 48', '$ 165', '$ 26', '$ 47', '$ 195',
'$ 87', '$ 63', '$ 136', '$ 71', '$ 41', '$ 112', '$ 196', '$ 162',
'$ 218', '$ 183', '$ 159', '$ 80', '$ 182', '$ 27', '$ 191',
'$ 74', '$ 125', '$ 10', '$ 208', '$ 94', '$ 149', '$ 36', '$ 114',
'$ 103', '$ 147', '$ 193', '$ 209', '$ 142', '$ 19', '$ 11',
'$ 158', '$ 140', '$ 110', '$ 84', '$ 178', '$ 83', '$ 151',
'$ 234', '$ 189', '$ 176', '$ 237', '$ 153', '$ 67', '$ 186',
'$ 15', '$ 205', '$ 131', '$ 143', '$ 52', '$ 82', '$ 53', '$ 51',
'$ 28', '$ 30', '$ 77', '$ 72', '$ 101', '$ 225', '$ 62', '$ 128',
'$ 217', '$ 85', '$ 129', '$ 171', '$ 39', '$ 222', '$ 127',
'$ 116', '$ 144', '$ 184', '$ 44', '$ 188', '$ 138', '$ 164',
'$ 49', '$ 211', '$ 106', '$ 135', '$ 223', '$ 130', '$ 172',
'$ 187', '$ 105', '$ 32', '$ 215', '$ 64', '$ 154', '$ 239',
'$ 17', '$ 228', '$ 181', '$ 66', '$ 174', '$ 133', '$ 148',
'$ 13', '$ 214', '$ 201', '$ 20', '$ 88', '$ 235', '$ 25', '$ 240',
'$ 97', '$ 202', '$ 122000', '$ -193'], dtype=object)
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].str.slice_replace(stop=2, repl='')
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].replace(np.nan, '0.0')
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].replace(',', '.',regex=True)
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].astype(float, errors = 'raise')
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].apply(corregirNegativo)
LosAlpesHistorydf['service fee'].unique()
array([1.70e+02, 9.30e+01, 1.75e+02, 1.63e+02, 6.50e+01, 1.57e+02,
1.73e+02, 1.50e+02, 7.90e+01, 9.20e+01, 2.40e+01, 2.16e+02,
8.90e+01, 1.41e+02, 1.85e+02, 1.99e+02, 1.77e+02, 1.52e+02,
1.34e+02, 2.29e+02, 5.50e+01, 2.03e+02, 7.80e+01, 1.60e+02,
2.12e+02, 1.66e+02, 1.67e+02, 6.10e+01, 2.30e+01, 2.30e+02,
1.60e+01, 4.00e+01, 1.11e+02, 2.07e+02, 5.60e+01, 1.37e+02,
7.60e+01, 1.45e+02, 2.10e+01, 4.30e+01, 1.98e+02, 1.18e+02,
3.30e+01, 1.80e+01, 1.56e+02, 8.10e+01, 1.40e+01, 3.50e+01,
9.90e+01, 2.27e+02, 1.92e+02, 5.40e+01, 4.60e+01, 2.32e+02,
8.60e+01, 4.50e+01, 2.20e+01, 3.10e+01, 1.80e+02, 1.00e+02,
1.21e+02, 2.04e+02, 5.70e+01, 1.08e+02, 0.00e+00, 1.19e+02,
4.20e+01, 2.38e+02, 1.68e+02, 3.40e+01, 2.06e+02, 9.60e+01,
1.04e+02, 2.33e+02, 2.24e+02, 9.50e+01, 3.70e+01, 1.17e+02,
1.02e+02, 2.26e+02, 2.21e+02, 2.19e+02, 1.22e+02, 5.00e+01,
1.79e+02, 2.20e+02, 5.90e+01, 2.00e+02, 1.09e+02, 1.69e+02,
1.15e+02, 1.23e+02, 1.20e+02, 6.00e+01, 1.61e+02, 1.20e+01,
1.97e+02, 6.80e+01, 2.13e+02, 9.80e+01, 1.13e+02, 2.90e+01,
7.00e+01, 1.46e+02, 1.07e+02, 1.94e+02, 1.90e+02, 9.00e+01,
7.30e+01, 1.24e+02, 2.31e+02, 1.39e+02, 5.80e+01, 1.55e+02,
7.50e+01, 1.26e+02, 6.90e+01, 2.10e+02, 2.36e+02, 1.32e+02,
3.80e+01, 9.10e+01, 4.80e+01, 1.65e+02, 2.60e+01, 4.70e+01,
1.95e+02, 8.70e+01, 6.30e+01, 1.36e+02, 7.10e+01, 4.10e+01,
1.12e+02, 1.96e+02, 1.62e+02, 2.18e+02, 1.83e+02, 1.59e+02,
8.00e+01, 1.82e+02, 2.70e+01, 1.91e+02, 7.40e+01, 1.25e+02,
1.00e+01, 2.08e+02, 9.40e+01, 1.49e+02, 3.60e+01, 1.14e+02,
1.03e+02, 1.47e+02, 1.93e+02, 2.09e+02, 1.42e+02, 1.90e+01,
1.10e+01, 1.58e+02, 1.40e+02, 1.10e+02, 8.40e+01, 1.78e+02,
8.30e+01, 1.51e+02, 2.34e+02, 1.89e+02, 1.76e+02, 2.37e+02,
1.53e+02, 6.70e+01, 1.86e+02, 1.50e+01, 2.05e+02, 1.31e+02,
1.43e+02, 5.20e+01, 8.20e+01, 5.30e+01, 5.10e+01, 2.80e+01,
3.00e+01, 7.70e+01, 7.20e+01, 1.01e+02, 2.25e+02, 6.20e+01,
1.28e+02, 2.17e+02, 8.50e+01, 1.29e+02, 1.71e+02, 3.90e+01,
2.22e+02, 1.27e+02, 1.16e+02, 1.44e+02, 1.84e+02, 4.40e+01,
1.88e+02, 1.38e+02, 1.64e+02, 4.90e+01, 2.11e+02, 1.06e+02,
1.35e+02, 2.23e+02, 1.30e+02, 1.72e+02, 1.87e+02, 1.05e+02,
3.20e+01, 2.15e+02, 6.40e+01, 1.54e+02, 2.39e+02, 1.70e+01,
2.28e+02, 1.81e+02, 6.60e+01, 1.74e+02, 1.33e+02, 1.48e+02,
1.30e+01, 2.14e+02, 2.01e+02, 2.00e+01, 8.80e+01, 2.35e+02,
2.50e+01, 2.40e+02, 9.70e+01, 2.02e+02, 1.22e+05])
# Limpieza #9: para los valores numericos vamos a utilizar la función corregir negativo. Esto partiendo del hecho de que si existen valores negativos serán errores
# de digitación (los volveremos positivos) y si hay un NaN quedará en cero.
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].apply(corregirNegativo)
LosAlpesHistorydf['minimum nights']=LosAlpesHistorydf['minimum nights'].apply(corregirNegativo)
LosAlpesHistorydf['availability 365']=LosAlpesHistorydf['availability 365'].apply(corregirNegativo)
LosAlpesHistorydf['number of reviews']=LosAlpesHistorydf['number of reviews'].apply(corregirNegativo)
LosAlpesHistorydf['review rate number']=LosAlpesHistorydf['review rate number'].apply(corregirNegativo)
# Limpieza #10: Para el año de construcción, esperarmos outlayers y valores nan. Depende de lo poco logicos de los valores se tomarán acciones.
LosAlpesHistorydf['construction year'].unique()
array([2011., 2021., 2004., 2012., 2007., 2022., 2008., 2009., 2006.,
2003., 2014., 2005., 2020., 2015., 2016., 2013., 2018., 2017.,
2019., 2010., nan, 1020., 1022.])
def corregirAño(x):
if x > 2002:
return x
elif x <= 1022:
return x + 1000
else:
return 2002
LosAlpesHistorydf['construction year']=LosAlpesHistorydf['construction year'].apply(corregirAño)
LosAlpesHistorydf['construction year'].unique()
array([2011., 2021., 2004., 2012., 2007., 2022., 2008., 2009., 2006.,
2003., 2014., 2005., 2020., 2015., 2016., 2013., 2018., 2017.,
2019., 2010., 2002.])
# Limpieza #11: Se crea una función para corregir las disponibilidades mayores a 365 tomando como base que ningun sitio puede tener mas de 365 de disponibilidad en un año.
def corregirDisponibilidad(x):
if x < 366:
return x
elif x >= 366:
return 365
else:
return 365
LosAlpesHistorydf['availability 365']=LosAlpesHistorydf['availability 365'].apply(corregirDisponibilidad)
# Limpieza #12: por ultimo, vamos a quitar duplicados.
LosAlpesHistorydf.drop_duplicates(keep='first', inplace=True)
LosAlpesHistorydf.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 101547 entries, 0 to 102082 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 101547 non-null int64 1 neighbourhood group 101547 non-null object 2 neighbourhood 100946 non-null object 3 lat 100954 non-null float64 4 long 100954 non-null float64 5 country 101547 non-null object 6 instant_bookable 101547 non-null bool 7 cancellation_policy 101547 non-null object 8 room type 101547 non-null object 9 construction year 101547 non-null float64 10 price 101547 non-null float64 11 service fee 101547 non-null float64 12 minimum nights 101547 non-null float64 13 availability 365 101547 non-null float64 14 number of reviews 101547 non-null float64 15 review rate number 101547 non-null float64 dtypes: bool(1), float64(9), int64(1), object(5) memory usage: 12.5+ MB
# Posterior a la limpieza de datos, vamos a hacer un pequeño analisis bivariado tomando como base la zona o grupo de vecindario y el numero de reviews
# esto me podrá dar luz sobre nuevas varias (numero de reviews por apartamento en la zona) y así tener mayor conocimiento del comportamiento de reviews
# en el set de datos.
df_grafica= LosAlpesHistorydf[['neighbourhood group','number of reviews']].groupby(['neighbourhood group']).sum('number of reviews')
df_grafica=df_grafica.sort_values('neighbourhood group', ascending=True).reset_index()
df_grafica.plot('neighbourhood group','number of reviews', kind = 'bar')
<AxesSubplot:xlabel='neighbourhood group'>
df_grafica2 = LosAlpesHistorydf[['neighbourhood group']].groupby(['neighbourhood group']).size().reset_index(name='num_registers')
df_grafica2=df_grafica2.sort_values('neighbourhood group', ascending=True).reset_index()
df_grafica2.plot('neighbourhood group','num_registers', kind = 'bar')
<AxesSubplot:xlabel='neighbourhood group'>
df_grafica['num_registers']=df_grafica2['num_registers']
df_grafica['Average Review By Location']=df_grafica['number of reviews']/df_grafica['num_registers']
df_grafica=df_grafica.sort_values('Average Review By Location', ascending=False)
df_grafica.plot('neighbourhood group','Average Review By Location', kind = 'bar')
<AxesSubplot:xlabel='neighbourhood group'>
# Con esta relación entendemos que el numero promedio de review por zona no varia demasiado entre Bronx, Queens y State Island.
df_grafica
| neighbourhood group | number of reviews | num_registers | Average Review By Location | |
|---|---|---|---|---|
| 5 | State Island | 33428.0 | 937 | 35.675560 |
| 4 | Queens | 435165.0 | 12952 | 33.598286 |
| 0 | Bronx | 83639.0 | 2649 | 31.573801 |
| 1 | Brooklyn | 1172642.0 | 41236 | 28.437336 |
| 2 | Manhattan | 1038884.0 | 43159 | 24.071086 |
| 3 | No Information | 3048.0 | 614 | 4.964169 |
df_grafica3 = LosAlpesHistorydf[['availability 365','number of reviews']].groupby(['availability 365']).sum('number of reviews')
df_grafica3 = df_grafica3.sort_values('availability 365', ascending=True).reset_index()
df_grafica4 = LosAlpesHistorydf[['availability 365']].groupby(['availability 365']).size().reset_index(name='num_registers')
df_grafica4 = df_grafica4.sort_values('availability 365', ascending=True).reset_index()
df_grafica3=df_grafica3.reset_index()
df_grafica3['num_registers']=df_grafica4['num_registers']
df_grafica3['Average by Day']=df_grafica3['number of reviews']/df_grafica3['num_registers']
df_grafica3
| index | availability 365 | number of reviews | num_registers | Average by Day | |
|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 253266.0 | 24283 | 10.429766 |
| 1 | 1 | 1.0 | 22446.0 | 774 | 29.000000 |
| 2 | 2 | 2.0 | 11801.0 | 533 | 22.140713 |
| 3 | 3 | 3.0 | 16354.0 | 614 | 26.635179 |
| 4 | 4 | 4.0 | 11680.0 | 474 | 24.641350 |
| ... | ... | ... | ... | ... | ... |
| 361 | 361 | 361.0 | 11279.0 | 276 | 40.865942 |
| 362 | 362 | 362.0 | 8486.0 | 375 | 22.629333 |
| 363 | 363 | 363.0 | 9571.0 | 500 | 19.142000 |
| 364 | 364 | 364.0 | 17439.0 | 1153 | 15.124892 |
| 365 | 365 | 365.0 | 129927.0 | 5178 | 25.092121 |
366 rows × 5 columns
# PUNTO 2: Muestre los resultados tras el entrenamiento y selección del mejor modelo obtenido a partir de los datos suministrados. Utilice las
# técnicas de transformación de datos, selección de atributos y regularización que considere convenientes. Explique cómo este modelo responde al
# objetivo planteado y si los resultados serían, de acuerdo a su criterio, suficientes para su uso por parte de la inmobiliaria. Recuerde argumentar
# sus decisiones con base en la interpretación cuantitativa y cualitativa de los resultados.
# Primero vamos a preparar el modelo para su entrenamiento
enc = OneHotEncoder(handle_unknown='ignore')
salida = enc.fit(LosAlpesHistorydf)
# Posterior a esto transformamos los datos. Country no lo hacemos por tener un solo valor.
le = LabelEncoder()
LosAlpesHistorydf['neighbourhood group encoded']=le.fit_transform(LosAlpesHistorydf['neighbourhood group'])
LosAlpesHistorydf['neighbourhood encoded']=le.fit_transform(LosAlpesHistorydf['neighbourhood'])
LosAlpesHistorydf['cancellation_policy encoded']=le.fit_transform(LosAlpesHistorydf['cancellation_policy'])
LosAlpesHistorydf['room type encoded']=le.fit_transform(LosAlpesHistorydf['room type'])
LosAlpesHistorydf.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 101547 entries, 0 to 102082 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 101547 non-null int64 1 neighbourhood group 101547 non-null object 2 neighbourhood 100946 non-null object 3 lat 100954 non-null float64 4 long 100954 non-null float64 5 country 101547 non-null object 6 instant_bookable 101547 non-null bool 7 cancellation_policy 101547 non-null object 8 room type 101547 non-null object 9 construction year 101547 non-null float64 10 price 101547 non-null float64 11 service fee 101547 non-null float64 12 minimum nights 101547 non-null float64 13 availability 365 101547 non-null float64 14 number of reviews 101547 non-null float64 15 review rate number 101547 non-null float64 16 neighbourhood group encoded 101547 non-null int32 17 neighbourhood encoded 101547 non-null int32 18 cancellation_policy encoded 101547 non-null int32 19 room type encoded 101547 non-null int32 dtypes: bool(1), float64(9), int32(4), int64(1), object(5) memory usage: 14.0+ MB
# En este caso quité columnas id, latitud, longitud, país (todo es EEUU) que no aportan mayor valor al analisis. Luego revisaremos la correlación entre variables
# para definir que otras variables aportan poco valor.
LosAlpesHistoryDFV2 = LosAlpesHistorydf[['instant_bookable', 'room type encoded','construction year','price','service fee','minimum nights','availability 365','number of reviews','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
LosAlpesHistoryDFV2.drop_duplicates(keep='first', inplace=True)
LosAlpesHistoryDFV2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 94973 entries, 0 to 102082 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 instant_bookable 94973 non-null bool 1 room type encoded 94973 non-null int32 2 construction year 94973 non-null float64 3 price 94973 non-null float64 4 service fee 94973 non-null float64 5 minimum nights 94973 non-null float64 6 availability 365 94973 non-null float64 7 number of reviews 94973 non-null float64 8 neighbourhood group encoded 94973 non-null int32 9 cancellation_policy encoded 94973 non-null int32 10 neighbourhood encoded 94973 non-null int32 dtypes: bool(1), float64(6), int32(4) memory usage: 6.6 MB
LosAlpesHistoryDFV2.corr()
| instant_bookable | room type encoded | construction year | price | service fee | minimum nights | availability 365 | number of reviews | neighbourhood group encoded | cancellation_policy encoded | neighbourhood encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| instant_bookable | 1.000000 | 0.000588 | 0.002866 | -0.001680 | 0.003341 | -0.003025 | -0.003863 | 0.000672 | -0.002043 | -0.003941 | 0.007372 |
| room type encoded | 0.000588 | 1.000000 | 0.005816 | 0.003934 | 0.003286 | -0.054535 | -0.009870 | -0.001695 | 0.026484 | 0.003465 | -0.069890 |
| construction year | 0.002866 | 0.005816 | 1.000000 | -0.005168 | -0.005040 | -0.002781 | -0.006711 | 0.000730 | -0.003943 | 0.005685 | -0.001876 |
| price | -0.001680 | 0.003934 | -0.005168 | 1.000000 | 0.010493 | -0.002520 | 0.003136 | -0.000974 | -0.001183 | 0.003545 | 0.001896 |
| service fee | 0.003341 | 0.003286 | -0.005040 | 0.010493 | 1.000000 | 0.000070 | -0.003929 | -0.000802 | -0.002356 | -0.000335 | -0.004418 |
| minimum nights | -0.003025 | -0.054535 | -0.002781 | -0.002520 | 0.000070 | 1.000000 | 0.056980 | -0.047902 | -0.001105 | -0.000551 | 0.013433 |
| availability 365 | -0.003863 | -0.009870 | -0.006711 | 0.003136 | -0.003929 | 0.056980 | 1.000000 | 0.093556 | 0.066280 | 0.000912 | -0.016052 |
| number of reviews | 0.000672 | -0.001695 | 0.000730 | -0.000974 | -0.000802 | -0.047902 | 0.093556 | 1.000000 | 0.022066 | -0.002430 | -0.044921 |
| neighbourhood group encoded | -0.002043 | 0.026484 | -0.003943 | -0.001183 | -0.002356 | -0.001105 | 0.066280 | 0.022066 | 1.000000 | 0.002763 | 0.082672 |
| cancellation_policy encoded | -0.003941 | 0.003465 | 0.005685 | 0.003545 | -0.000335 | -0.000551 | 0.000912 | -0.002430 | 0.002763 | 1.000000 | 0.003253 |
| neighbourhood encoded | 0.007372 | -0.069890 | -0.001876 | 0.001896 | -0.004418 | 0.013433 | -0.016052 | -0.044921 | 0.082672 | 0.003253 | 1.000000 |
# Se logra evidenciar que no hay variables altamente relacionadas a la variable Number of Reviews. Vamos a seleccionar las que esten en estén entre
# de 0.0099999 y -0.0099999 para el modelo de regressión Lasso (necesita pocas variales). El modelo completo será usado para el modelo de regresión
# Lineal y el Ridge
LosAlpesHistoryDFVlasso = LosAlpesHistorydf[['room type encoded','minimum nights','availability 365','number of reviews','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
# Generaremos un informe sobre el DF de la regresión Lineal y la regresión Ridge
profile = ProfileReport(LosAlpesHistoryDFV2)
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
LosAlpesHistoryDFV2['number of reviews'].describe(percentiles = [.25, .5, .75, .95, .99])
count 94973.000000 mean 27.545271 std 50.036680 min 0.000000 25% 1.000000 50% 7.000000 75% 30.000000 95% 126.000000 99% 235.000000 max 1024.000000 Name: number of reviews, dtype: float64
plt.figure(figsize = (28, 3))
plt.boxplot(LosAlpesHistoryDFV2['number of reviews'], vert = False)
plt.show()
# Ahora pasaremos a definir el set de entrenamiento y set de pruebas
lin_reg = LinearRegression()
x = LosAlpesHistoryDFV2[['instant_bookable', 'room type encoded','construction year','price','service fee','minimum nights','availability 365','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
y = LosAlpesHistoryDFV2['number of reviews']
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
# Escalando caracteristicas
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
regr = LinearRegression()
regr.fit(X_train_scaled, Y_train)
LinearRegression()
regr.coef_
array([-0.01528907, -0.17061288, 0.13195494, -0.07941148, -0.05112678,
-2.42945799, 4.75721361, 0.96776219, -0.21131593, -2.27824987])
regr.intercept_
27.62672041635956
# Evaluación del Modelo
preds_train = regr.predict(X_train_scaled)
preds_test = regr.predict(X_test_scaled)
MAE_LinealRegretion=mean_absolute_error(Y_train, preds_train), mean_absolute_error(Y_test, preds_test)
MAE_LinealRegretion
(31.093547988783314, 30.85671681609628)
RMSE_LinealRegretion=np.sqrt(mean_squared_error(Y_train, preds_train)), np.sqrt(mean_squared_error(Y_test, preds_test))
RMSE_LinealRegretion
(49.753600629050645, 49.546126228626306)
plt.figure(figsize = (28, 3))
plt.boxplot((Y_test - preds_test), vert = False)
plt.grid()
plt.show()
worst_condition = np.where((Y_test - preds_test) > 3, True, False)
worst_cases = pd.concat([
X_test.loc[worst_condition],
Y_test.loc[worst_condition]
], axis = 1).reset_index(drop = True)
worst_cases['predictions'] = pd.Series(preds_test).loc[worst_condition].reset_index(drop = True)
worst_cases.shape
(7025, 12)
worst_cases
| instant_bookable | room type encoded | construction year | price | service fee | minimum nights | availability 365 | neighbourhood group encoded | cancellation_policy encoded | neighbourhood encoded | number of reviews | predictions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | 0 | 2008.0 | 703.00 | 141.0 | 2.0 | 44.0 | 4 | 2 | 167 | 30.0 | 24.702726 |
| 1 | True | 3 | 2015.0 | 1.02 | 204.0 | 2.0 | 33.0 | 1 | 2 | 28 | 90.0 | 26.037481 |
| 2 | True | 0 | 2012.0 | 698.00 | 140.0 | 2.0 | 60.0 | 2 | 2 | 62 | 67.0 | 26.914470 |
| 3 | True | 3 | 2012.0 | 782.00 | 156.0 | 30.0 | 363.0 | 1 | 3 | 13 | 59.0 | 35.774765 |
| 4 | False | 3 | 2005.0 | 100.00 | 20.0 | 1.0 | 345.0 | 1 | 1 | 217 | 55.0 | 31.060535 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7020 | True | 3 | 2013.0 | 1.10 | 219.0 | 1.0 | 362.0 | 4 | 3 | 107 | 102.0 | 37.711804 |
| 7021 | True | 0 | 2020.0 | 212.00 | 42.0 | 3.0 | 53.0 | 5 | 3 | 168 | 79.0 | 25.970045 |
| 7022 | False | 0 | 2006.0 | 370.00 | 74.0 | 4.0 | 1.0 | 1 | 1 | 13 | 239.0 | 25.590953 |
| 7023 | False | 0 | 2017.0 | 449.00 | 90.0 | 4.0 | 6.0 | 2 | 2 | 205 | 89.0 | 20.371231 |
| 7024 | True | 3 | 2010.0 | 420.00 | 84.0 | 3.0 | 136.0 | 1 | 1 | 162 | 169.0 | 25.286041 |
7025 rows × 12 columns
poly_features = PolynomialFeatures(degree=3,include_bias=False)
x_poly = poly_features.fit_transform(X_train)
ridge_df = Ridge(alpha=1)
ridge_df.fit(x_poly,Y_train)
df_poly_training=ridge_df.predict(x_poly)
x_poly_test = poly_features.fit_transform(X_test)
ridge_df_test = Ridge(alpha=1)
ridge_df_test.fit(x_poly_test,Y_test)
Ridge(alpha=1)
df_poly_test=ridge_df_test.predict(x_poly_test)
MAE_RidgeRegretion=mean_absolute_error(Y_train, df_poly_training), mean_absolute_error(Y_test, df_poly_test)
MAE_RidgeRegretion
(29.54641094681216, 29.06540975928463)
RMSE_RidgeRegretion=np.sqrt(mean_squared_error(Y_train, df_poly_training)), np.sqrt(mean_squared_error(Y_test, df_poly_test))
RMSE_RidgeRegretion
(48.41898641139969, 48.0932730258384)
# Para lasso, debemos utilizar un set de variables más acotadas. Así que utilizaremos los campos definidos en el DF LosAlpesHistoryDFVlasso
xLasso = LosAlpesHistoryDFV2[['room type encoded','minimum nights','availability 365','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
yLasso = LosAlpesHistoryDFV2['number of reviews']
X_train, X_test, Y_train, Y_test = train_test_split(xLasso, yLasso, test_size = 0.3, random_state = 1)
lasso_features = PolynomialFeatures(degree=3,include_bias=False)
x_lasso = lasso_features.fit_transform(X_train)
lasso_df = Lasso(alpha=5)
lasso_df.fit(x_lasso,Y_train)
df_lasso_training=lasso_df.predict(x_lasso)
x_lasso_test = lasso_features.fit_transform(X_test)
lasso_df_test = Lasso(alpha=5)
lasso_df_test.fit(x_lasso_test,Y_test)
Lasso(alpha=5)
df_lasso_test=lasso_df_test.predict(x_lasso_test)
MAE_Laso=mean_absolute_error(Y_train, df_lasso_training), mean_absolute_error(Y_test, df_lasso_test)
MAE_Laso
(29.834082922903125, 29.311766051029927)
RMSE_Lasso=np.sqrt(mean_squared_error(Y_train, df_lasso_training)), np.sqrt(mean_squared_error(Y_test, df_lasso_test))
RMSE_Lasso
(48.74487729665361, 48.40488986379656)
# Finalmente compararemos los tres modelos con las variables almacenadas
MAE_LinealRegretion, MAE_RidgeRegretion, MAE_Laso
((31.093547988783314, 30.85671681609628), (29.54641094681216, 29.06540975928463), (29.834082922903125, 29.311766051029927))
RMSE_LinealRegretion, RMSE_RidgeRegretion, RMSE_Lasso
((49.753600629050645, 49.546126228626306), (48.41898641139969, 48.0932730258384), (48.74487729665361, 48.40488986379656))
# Dado que no existe grandes diferencias, como resultado de la experimentación entre LASO y RIDGE, tome la decisión de predecir el modelo con RIDGE dado que me permite
# utilizar la totalidad de variables del dataset nuevo. La diferencia entre lo entrenado y el test en MAE del Ridge es apenas 0,48 y en LASO es 0,34. Insisto, se que LASO es un poco mejor pero
# sacrificaré eso para poder utilizar todas las columnas del nuevo dataset.
# PUNTO 3: Utilice el mejor modelo obtenido para estimar la popularidad de los inmuebles próximos a publicarse. A partir de este resultado, apoye a la
# inmobiliaria en la definición del presupuesto de marketing que debe aprovisionar para promocionar los inmuebles menos populares sabiendo que la
# plataforma en la que se publican cobra por anuncio el 2% del precio del inmueble. Por ejemplo, si un inmueble tiene como precio $200, la plataforma
# cobrará por publicar un anuncio para el inmueble un total de $4. Usted es el encargado,a partir de los resultados de las predicciones, de definir el
# límite superior de comentarios para considerar un inmueble no popular. Explique esta decisión claramente así como el total de presupuesto que se
# debe aprovisionar acompañado del total de inmuebles a promocionar
# Vamos a iniciar realizando los mismos 11 pasos que se hicieron en la transformación de datos del primer dataFrame.
LosAlpesNewdf = pd.read_csv(r"C:\Users\LGARCIA\OneDrive - Valorem\LGARCIA\Documents\04. Estudio\05. Maestria\Ciencia Aplicada de Datos\CAD_Parcial1\Data\losalpes_new.csv")
LosAlpesNewdf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 513 entries, 0 to 512 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 513 non-null int64 1 neighbourhood group 512 non-null object 2 neighbourhood 512 non-null object 3 lat 512 non-null float64 4 long 512 non-null float64 5 country 507 non-null object 6 instant_bookable 512 non-null object 7 cancellation_policy 512 non-null object 8 room type 512 non-null object 9 construction year 512 non-null float64 10 price 512 non-null object 11 service fee 511 non-null object 12 minimum nights 510 non-null float64 13 availability 365 511 non-null float64 dtypes: float64(5), int64(1), object(8) memory usage: 56.2+ KB
LosAlpesNewdf['neighbourhood group'].unique()
array(['Queens', 'Manhattan', 'Brooklyn', 'Bronx', 'Staten Island', nan],
dtype=object)
LosAlpesNewdf['neighbourhood group']=LosAlpesNewdf['neighbourhood group'].replace(np.nan, 'No Information')
LosAlpesNewdf['neighbourhood group']=LosAlpesNewdf['neighbourhood group'].apply(corregirneighbourhoodGroup1)
LosAlpesNewdf['neighbourhood group']=LosAlpesNewdf['neighbourhood group'].apply(corregirneighbourhoodGroup2)
LosAlpesNewdf['neighbourhood group'].unique()
array(['Queens', 'Manhattan', 'Brooklyn', 'Bronx', 'State Island',
'No Information'], dtype=object)
LosAlpesNewdf['neighbourhood']=LosAlpesNewdf['neighbourhood'].str.strip()
LosAlpesNewdf['country']="United States"
LosAlpesNewdf['instant_bookable']=LosAlpesNewdf['instant_bookable'].replace(np.nan, False)
LosAlpesNewdf['cancellation_policy']=LosAlpesNewdf['cancellation_policy'].replace(np.nan, "No Information")
LosAlpesNewdf['room type']=LosAlpesNewdf['room type'].replace(np.nan, "No Information")
LosAlpesNewdf['construction year']=LosAlpesNewdf['construction year'].apply(corregirAño)
LosAlpesNewdf['price']=LosAlpesNewdf['price'].str.slice_replace(stop=2, repl='')
LosAlpesNewdf['price']=LosAlpesNewdf['price'].replace(np.nan, '0.0')
LosAlpesNewdf['price']=LosAlpesNewdf['price'].replace(',', '.',regex=True)
LosAlpesNewdf['price']=LosAlpesNewdf['price'].astype(float, errors = 'raise')
LosAlpesNewdf['price'].unique()
array([519. , 987. , 999. , 497. , 820. , 1.11, 913. , 238. ,
930. , 822. , 901. , 249. , 997. , 888. , 264. , 263. ,
288. , 144. , 124. , 352. , 89. , 231. , 1.19, 649. ,
427. , 615. , 659. , 502. , 1.15, 827. , 695. , 129. ,
1.17, 1. , 943. , 155. , 842. , 150. , 984. , 432. ,
664. , 422. , 613. , 418. , 1.02, 118. , 931. , 177. ,
546. , 793. , 1.01, 157. , 559. , 158. , 898. , 1.12,
1.09, 700. , 1.1 , 614. , 965. , 527. , 1.04, 736. ,
312. , 274. , 308. , 642. , 329. , 488. , 592. , 1.2 ,
240. , 380. , 802. , 1.05, 672. , 1.06, 645. , 1.07,
843. , 761. , 634. , 808. , 192. , 401. , 367. , 171. ,
445. , 216. , 115. , 114. , 1.18, 990. , 500. , 394. ,
701. , 783. , 307. , 409. , 103. , 693. , 284. , 481. ,
662. , 937. , 838. , 716. , 976. , 159. , 834. , 339. ,
348. , 951. , 75. , 925. , 139. , 840. , 689. , 720. ,
749. , 601. , 928. , 327. , 964. , 289. , 137. , 1.08,
472. , 168. , 958. , 622. , 766. , 323. , 554. , 967. ,
300. , 56. , 265. , 1.14, 703. , 146. , 625. , 596. ,
724. , 821. , 166. , 214. , 705. , 854. , 201. , 444. ,
261. , 117. , 490. , 926. , 746. , 977. , 515. , 172. ,
234. , 813. , 884. , 354. , 578. , 521. , 303. , 453. ,
215. , 602. , 867. , 814. , 244. , 737. , 589. , 1.16,
544. , 795. , 807. , 70. , 383. , 185. , 536. , 301. ,
61. , 135. , 668. , 711. , 403. , 845. , 597. , 593. ,
525. , 64. , 430. , 341. , 282. , 829. , 580. , 290. ,
631. , 873. , 431. , 73. , 969. , 407. , 326. , 853. ,
207. , 983. , 337. , 220. , 357. , 881. , 83. , 851. ,
574. , 902. , 179. , 893. , 50. , 463. , 322. , 315. ,
346. , 520. , 286. , 518. , 489. , 387. , 398. , 696. ,
687. , 637. , 599. , 639. , 606. , 663. , 636. , 458. ,
304. , 730. , 498. , 595. , 332. , 979. , 62. , 228. ,
575. , 0. , 621. , 697. , 495. , 279. , 197. , 966. ,
465. , 524. , 547. , 361. , 970. , 363. , 141. , 712. ,
459. , 503. , 51. , 423. , 311. , 506. , 151. , 1.03,
514. , 694. , 674. , 774. , 978. , 392. , 753. , 100. ,
1.13, 562. , 176. , 619. , 154. , 143. , 464. , 877. ,
770. , 651. , 259. , 77. , 174. , 811. , 470. , 538. ,
438. , 68. , 988. , 60. , 287. , 181. , 309. , 134. ,
944. , 556. , 208. , 899. , 864. , 195. , 803. , 882. ,
478. , 573. , 219. , 892. , 390. , 153. , 833. , 570. ,
870. , 245. , 400. , 914. , 785. , 491. , 678. , 819. ,
773. , 537. , 273. , 938. , 837. , 734. , 440. , 164. ,
230. , 971. , 358. , 198. , 751. , 484. , 948. , 378. ,
366. , 156. , 828. , 745. , 775. , 861. , 345. , 69. ,
863. , 342. , 630. , 110. , 251. ])
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].str.slice_replace(stop=2, repl='')
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].replace(np.nan, '0.0')
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].replace(',', '.',regex=True)
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].astype(float, errors = 'raise')
LosAlpesNewdf['service fee'].unique()
array([104., 197., 200., 99., 164., 221., 183., 48., 186., 180., 50.,
199., 178., 53., 58., 29., 25., 70., 18., 46., 237., 130.,
85., 123., 132., 100., 230., 165., 139., 26., 234., 189., 31.,
168., 30., 86., 133., 84., 204., 24., 35., 109., 159., 203.,
112., 32., 225., 218., 140., 220., 193., 105., 208., 147., 62.,
55., 128., 66., 98., 118., 240., 76., 160., 210., 134., 212.,
129., 215., 169., 152., 127., 162., 38., 80., 73., 34., 89.,
43., 23., 238., 236., 198., 79., 157., 61., 82., 21., 57.,
96., 187., 143., 195., 167., 222., 68., 190., 15., 185., 28.,
138., 144., 150., 120., 65., 27., 94., 192., 124., 153., 111.,
60., 11., 228., 141., 224., 125., 119., 145., 214., 216., 33.,
171., 201., 40., 52., 149., 103., 202., 47., 163., 177., 71.,
116., 91., 173., 49., 223., 232., 161., 219., 14., 77., 37.,
235., 107., 12., 231., 142., 81., 13., 56., 166., 126., 175.,
239., 194., 41., 67., 44., 176., 17., 170., 115., 36., 179.,
233., 10., 207., 93., 64., 63., 69., 137., 121., 92., 146.,
229., 196., 0., 39., 209., 72., 101., 206., 135., 155., 78.,
151., 20., 227., 154., 213., 108., 88., 42., 205., 114., 174.,
136., 188., 97., 172., 211., 22.])
LosAlpesNewdf['minimum nights']=LosAlpesNewdf['minimum nights'].apply(corregirNegativo)
LosAlpesNewdf['availability 365']=LosAlpesNewdf['availability 365'].apply(corregirNegativo)
LosAlpesNewdf['availability 365']=LosAlpesNewdf['availability 365'].apply(corregirDisponibilidad)
LosAlpesNewdf.drop_duplicates(keep='first', inplace=True)
LosAlpesNewdf.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 513 entries, 0 to 512 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 513 non-null int64 1 neighbourhood group 513 non-null object 2 neighbourhood 512 non-null object 3 lat 512 non-null float64 4 long 512 non-null float64 5 country 513 non-null object 6 instant_bookable 513 non-null bool 7 cancellation_policy 513 non-null object 8 room type 513 non-null object 9 construction year 513 non-null float64 10 price 513 non-null float64 11 service fee 513 non-null float64 12 minimum nights 513 non-null float64 13 availability 365 513 non-null float64 dtypes: bool(1), float64(7), int64(1), object(5) memory usage: 56.6+ KB
LosAlpesNewdf['neighbourhood group encoded']=le.fit_transform(LosAlpesNewdf['neighbourhood group'])
LosAlpesNewdf['neighbourhood encoded']=le.fit_transform(LosAlpesNewdf['neighbourhood'])
LosAlpesNewdf['cancellation_policy encoded']=le.fit_transform(LosAlpesNewdf['cancellation_policy'])
LosAlpesNewdf['room type encoded']=le.fit_transform(LosAlpesNewdf['room type'])
LosAlpesNewdf.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 513 entries, 0 to 512 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 513 non-null int64 1 neighbourhood group 513 non-null object 2 neighbourhood 512 non-null object 3 lat 512 non-null float64 4 long 512 non-null float64 5 country 513 non-null object 6 instant_bookable 513 non-null bool 7 cancellation_policy 513 non-null object 8 room type 513 non-null object 9 construction year 513 non-null float64 10 price 513 non-null float64 11 service fee 513 non-null float64 12 minimum nights 513 non-null float64 13 availability 365 513 non-null float64 14 neighbourhood group encoded 513 non-null int32 15 neighbourhood encoded 513 non-null int32 16 cancellation_policy encoded 513 non-null int32 17 room type encoded 513 non-null int32 dtypes: bool(1), float64(7), int32(4), int64(1), object(5) memory usage: 64.6+ KB
PredictionDF = LosAlpesNewdf[['instant_bookable', 'room type encoded','construction year','price','service fee','minimum nights','availability 365','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
PredictionDF.drop_duplicates(keep='first', inplace=True)
PredictionDF.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 513 entries, 0 to 512 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 instant_bookable 513 non-null bool 1 room type encoded 513 non-null int32 2 construction year 513 non-null float64 3 price 513 non-null float64 4 service fee 513 non-null float64 5 minimum nights 513 non-null float64 6 availability 365 513 non-null float64 7 neighbourhood group encoded 513 non-null int32 8 cancellation_policy encoded 513 non-null int32 9 neighbourhood encoded 513 non-null int32 dtypes: bool(1), float64(5), int32(4) memory usage: 32.6 KB
# En este punto utilizaré la predicción Ridge, de acuerdo a lo expuesto en el punto anterior. Podría utilizar laso recortando la cantidad de variables que vamos a usar.
x_ridge_predict = poly_features.fit_transform(PredictionDF)
df_ridge_predict=ridge_df.predict(x_ridge_predict)
PredictionDF['Predicted Reviews']=df_ridge_predict
PredictionDF['Predicted Reviews']=PredictionDF['Predicted Reviews'].astype(int, errors = 'raise')
PredictionDF.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 513 entries, 0 to 512 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 instant_bookable 513 non-null bool 1 room type encoded 513 non-null int32 2 construction year 513 non-null float64 3 price 513 non-null float64 4 service fee 513 non-null float64 5 minimum nights 513 non-null float64 6 availability 365 513 non-null float64 7 neighbourhood group encoded 513 non-null int32 8 cancellation_policy encoded 513 non-null int32 9 neighbourhood encoded 513 non-null int32 10 Predicted Reviews 513 non-null int32 dtypes: bool(1), float64(5), int32(5) memory usage: 34.6 KB
# Depuro un poco los datos predichos, quitandole ceros.
PredictionDF['Predicted Reviews']=PredictionDF['Predicted Reviews'].apply(corregirNegativo)
PredictionDF.sort_values('Predicted Reviews', ascending=True)
| instant_bookable | room type encoded | construction year | price | service fee | minimum nights | availability 365 | neighbourhood group encoded | cancellation_policy encoded | neighbourhood encoded | Predicted Reviews | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 431 | True | 3 | 2022.0 | 892.00 | 178.0 | 91.0 | 34.0 | 2 | 2 | 61 | 0 |
| 339 | False | 3 | 2010.0 | 547.00 | 109.0 | 1.0 | 0.0 | 5 | 2 | 31 | 0 |
| 33 | True | 3 | 2017.0 | 1.17 | 234.0 | 3.0 | 0.0 | 1 | 3 | 21 | 3 |
| 43 | True | 3 | 2009.0 | 613.00 | 123.0 | 30.0 | 0.0 | 4 | 3 | 20 | 5 |
| 426 | True | 4 | 2009.0 | 882.00 | 176.0 | 1.0 | 0.0 | 4 | 3 | 1 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 165 | True | 0 | 2010.0 | 166.00 | 33.0 | 2.0 | 301.0 | 0 | 2 | 65 | 57 |
| 14 | True | 0 | 2017.0 | 888.00 | 178.0 | 3.0 | 166.0 | 1 | 3 | 5 | 60 |
| 346 | True | 1 | 2003.0 | 1.11 | 221.0 | 1.0 | 346.0 | 2 | 3 | 89 | 104 |
| 176 | False | 1 | 2019.0 | 490.00 | 98.0 | 1.0 | 282.0 | 2 | 2 | 89 | 110 |
| 323 | False | 2 | 2002.0 | 0.00 | 0.0 | 0.0 | 0.0 | 3 | 0 | 94 | 114 |
513 rows × 11 columns
# Incluyo una nueva columna que será el costo de publicitar el inmueble, que es el 2% del valor. Tambien saco la media de los valores predichos y todo inmueble que no supere este rango, va a denominarse
# inmueble poco popular.
PredictionDF['Costo Publicacion']=PredictionDF['price']*0.02
vMean = PredictionDF['Predicted Reviews'].mean()
vMean
29.621832358674464
PredictionDF
| instant_bookable | room type encoded | construction year | price | service fee | minimum nights | availability 365 | neighbourhood group encoded | cancellation_policy encoded | neighbourhood encoded | Predicted Reviews | Costo Publicacion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | 3 | 2006.0 | 519.00 | 104.0 | 1.0 | 0.0 | 4 | 2 | 63 | 20 | 10.3800 |
| 1 | False | 0 | 2006.0 | 987.00 | 197.0 | 2.0 | 42.0 | 2 | 2 | 15 | 33 | 19.7400 |
| 2 | True | 3 | 2018.0 | 999.00 | 200.0 | 2.0 | 0.0 | 4 | 1 | 24 | 19 | 19.9800 |
| 3 | True | 3 | 2011.0 | 497.00 | 99.0 | 4.0 | 0.0 | 2 | 3 | 38 | 19 | 9.9400 |
| 4 | True | 0 | 2022.0 | 820.00 | 164.0 | 4.0 | 365.0 | 2 | 3 | 85 | 24 | 16.4000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 508 | False | 3 | 2014.0 | 103.00 | 21.0 | 2.0 | 0.0 | 1 | 2 | 3 | 10 | 2.0600 |
| 509 | True | 3 | 2008.0 | 110.00 | 22.0 | 2.0 | 21.0 | 1 | 2 | 3 | 20 | 2.2000 |
| 510 | False | 0 | 2022.0 | 251.00 | 50.0 | 30.0 | 335.0 | 1 | 3 | 90 | 27 | 5.0200 |
| 511 | True | 3 | 2022.0 | 1.01 | 201.0 | 1.0 | 41.0 | 1 | 3 | 35 | 28 | 0.0202 |
| 512 | True | 3 | 2016.0 | 1.05 | 209.0 | 1.0 | 162.0 | 4 | 3 | 71 | 52 | 0.0210 |
513 rows × 12 columns
# Al final creo un nuevo dataframe con los inmuebles que poco populares (numero de reviews menores a la media)
flt = PredictionDF['Predicted Reviews']<vMean
InmuebleaPublicarDF = PredictionDF[flt]
InmuebleaPublicarDF
| instant_bookable | room type encoded | construction year | price | service fee | minimum nights | availability 365 | neighbourhood group encoded | cancellation_policy encoded | neighbourhood encoded | Predicted Reviews | Costo Publicacion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | 3 | 2006.0 | 519.00 | 104.0 | 1.0 | 0.0 | 4 | 2 | 63 | 20 | 10.3800 |
| 2 | True | 3 | 2018.0 | 999.00 | 200.0 | 2.0 | 0.0 | 4 | 1 | 24 | 19 | 19.9800 |
| 3 | True | 3 | 2011.0 | 497.00 | 99.0 | 4.0 | 0.0 | 2 | 3 | 38 | 19 | 9.9400 |
| 4 | True | 0 | 2022.0 | 820.00 | 164.0 | 4.0 | 365.0 | 2 | 3 | 85 | 24 | 16.4000 |
| 6 | True | 0 | 2014.0 | 913.00 | 183.0 | 7.0 | 0.0 | 1 | 2 | 90 | 15 | 18.2600 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 507 | False | 4 | 2009.0 | 630.00 | 126.0 | 27.0 | 365.0 | 2 | 2 | 13 | 10 | 12.6000 |
| 508 | False | 3 | 2014.0 | 103.00 | 21.0 | 2.0 | 0.0 | 1 | 2 | 3 | 10 | 2.0600 |
| 509 | True | 3 | 2008.0 | 110.00 | 22.0 | 2.0 | 21.0 | 1 | 2 | 3 | 20 | 2.2000 |
| 510 | False | 0 | 2022.0 | 251.00 | 50.0 | 30.0 | 335.0 | 1 | 3 | 90 | 27 | 5.0200 |
| 511 | True | 3 | 2022.0 | 1.01 | 201.0 | 1.0 | 41.0 | 1 | 3 | 35 | 28 | 0.0202 |
266 rows × 12 columns
# para encontrar el presupuesto del area, sumo el costo y cuento la cantidad de lineas del nuevo dataframe.
Presupuesto=InmuebleaPublicarDF['Costo Publicacion'].sum()
Presupuesto
2249.8884
InmuebleaPublicarDF.shape
(266, 12)
# La respuesta es: Presupuesto: 2250 dolares para publicitar 266 inmuebles menos populares.